# HG changeset patch # User Matt Harbison # Date 2017-11-17 05:06:45 # Node ID 417e8e040102c6feca3d8cc6c17e5866563a7be5 # Parent b0c01a5ee35c1e920b141e570911bdb3a91d6f87 lfs: verify lfs object content when transferring to and from the remote store This avoids inserting corrupt files into the usercache, and local and remote stores. One down side is that the bad file won't be available locally for forensic purposes after a remote download. I'm thinking about adding an 'incoming' directory to the local lfs store to handle the download, and then move it to the 'objects' directory after it passes verification. That would have the additional benefit of not concatenating each transfer chunk in memory until the full file is transferred. Verification isn't needed when the data is passed back through the revlog interface or when the oid was just calculated, but otherwise it is on by default. The additional overhead should be well worth avoiding problems with file based remote stores, or buggy lfs servers. Having two different verify functions is a little sad, but the full data of the blob is mostly passed around in memory, because that's what the revlog interface wants. The upload function, however, chunks up the data. It would be ideal if that was how the content is always handled, but that's probably a huge project. I don't really like printing the long hash, but `hg debugdata` isn't a public interface, and is the only way to get it. The filelog and revision info is nowhere near this area, so recommending `hg verify` is the easiest thing to do. diff --git a/hgext/lfs/blobstore.py b/hgext/lfs/blobstore.py --- a/hgext/lfs/blobstore.py +++ b/hgext/lfs/blobstore.py @@ -7,6 +7,7 @@ from __future__ import absolute_import +import hashlib import json import os import re @@ -99,8 +100,11 @@ class local(object): self.cachevfs = lfsvfs(usercache) self.ui = repo.ui - def write(self, oid, data): + def write(self, oid, data, verify=True): """Write blob to local blobstore.""" + if verify: + _verify(oid, data) + with self.vfs(oid, 'wb', atomictemp=True) as fp: fp.write(data) @@ -110,14 +114,23 @@ class local(object): self.ui.note(_('lfs: adding %s to the usercache\n') % oid) lfutil.link(self.vfs.join(oid), self.cachevfs.join(oid)) - def read(self, oid): + def read(self, oid, verify=True): """Read blob from local blobstore.""" if not self.vfs.exists(oid): + blob = self._read(self.cachevfs, oid, verify) + self.ui.note(_('lfs: found %s in the usercache\n') % oid) lfutil.link(self.cachevfs.join(oid), self.vfs.join(oid)) - self.ui.note(_('lfs: found %s in the usercache\n') % oid) else: self.ui.note(_('lfs: found %s in the local lfs store\n') % oid) - return self.vfs.read(oid) + blob = self._read(self.vfs, oid, verify) + return blob + + def _read(self, vfs, oid, verify): + """Read blob (after verifying) from the given store""" + blob = vfs.read(oid) + if verify: + _verify(oid, blob) + return blob def has(self, oid): """Returns True if the local blobstore contains the requested blob, @@ -233,6 +246,8 @@ class _gitlfsremote(object): request = util.urlreq.request(href) if action == 'upload': # If uploading blobs, read data from local blobstore. + with localstore.vfs(oid) as fp: + _verifyfile(oid, fp) request.data = filewithprogress(localstore.vfs(oid), None) request.get_method = lambda: 'PUT' @@ -253,7 +268,7 @@ class _gitlfsremote(object): if action == 'download': # If downloading blobs, store downloaded data to local blobstore - localstore.write(oid, response) + localstore.write(oid, response, verify=True) def _batch(self, pointers, localstore, action): if action not in ['upload', 'download']: @@ -324,14 +339,14 @@ class _dummyremote(object): def writebatch(self, pointers, fromstore): for p in pointers: - content = fromstore.read(p.oid()) + content = fromstore.read(p.oid(), verify=True) with self.vfs(p.oid(), 'wb', atomictemp=True) as fp: fp.write(content) def readbatch(self, pointers, tostore): for p in pointers: content = self.vfs.read(p.oid()) - tostore.write(p.oid(), content) + tostore.write(p.oid(), content, verify=True) class _nullremote(object): """Null store storing blobs to /dev/null.""" @@ -368,6 +383,24 @@ class _promptremote(object): None: _promptremote, } +def _verify(oid, content): + realoid = hashlib.sha256(content).hexdigest() + if realoid != oid: + raise error.Abort(_('detected corrupt lfs object: %s') % oid, + hint=_('run hg verify')) + +def _verifyfile(oid, fp): + sha256 = hashlib.sha256() + while True: + data = fp.read(1024 * 1024) + if not data: + break + sha256.update(data) + realoid = sha256.hexdigest() + if realoid != oid: + raise error.Abort(_('detected corrupt lfs object: %s') % oid, + hint=_('run hg verify')) + def remote(repo): """remotestore factory. return a store in _storemap depending on config""" defaulturl = '' diff --git a/hgext/lfs/wrapper.py b/hgext/lfs/wrapper.py --- a/hgext/lfs/wrapper.py +++ b/hgext/lfs/wrapper.py @@ -54,7 +54,9 @@ def readfromstore(self, text): if not store.has(oid): p.filename = getattr(self, 'indexfile', None) self.opener.lfsremoteblobstore.readbatch([p], store) - text = store.read(oid) + + # The caller will validate the content + text = store.read(oid, verify=False) # pack hg filelog metadata hgmeta = {} @@ -76,7 +78,7 @@ def writetostore(self, text): # git-lfs only supports sha256 oid = hashlib.sha256(text).hexdigest() - self.opener.lfslocalblobstore.write(oid, text) + self.opener.lfslocalblobstore.write(oid, text, verify=False) # replace contents with metadata longoid = 'sha256:%s' % oid diff --git a/tests/test-lfs-test-server.t b/tests/test-lfs-test-server.t --- a/tests/test-lfs-test-server.t +++ b/tests/test-lfs-test-server.t @@ -105,16 +105,15 @@ Clear the cache to force a download lfs: found 37a65ab78d5ecda767e8622c248b5dbff1e68b1678ab0e730d5eb8601ec8ad19 in the local lfs store 3 files updated, 0 files merged, 0 files removed, 0 files unresolved -Test a corrupt file download, but clear the cache first to force a download - -XXX: ideally, the validation would occur before polluting the usercache and -local store, with a clearer error message. +Test a corrupt file download, but clear the cache first to force a download. $ rm -rf `hg config lfs.usercache` $ cp $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 blob $ echo 'damage' > $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 $ rm ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 $ rm ../repo1/* + +XXX: suggesting `hg verify` won't help with a corrupt file on the lfs server. $ hg --repo ../repo1 update -C tip -v resolving manifests getting a @@ -123,18 +122,16 @@ local store, with a clearer error messag lfs: found 31cf46fbc4ecd458a0943c5b4881f1f5a6dd36c53d6167d5b69ac45149b38e5b in the local lfs store getting c lfs: downloading d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 (19 bytes) - lfs: adding d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 to the usercache - lfs: processed: d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 - lfs: found d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 in the local lfs store - abort: integrity check failed on data/c.i:0! + abort: detected corrupt lfs object: d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 + (run hg verify) [255] -BUG: the corrupted blob was added to the usercache and local store +The corrupted blob is not added to the usercache or local store - $ cat ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 | $TESTDIR/f --sha256 - sha256=fa82ca222fc9813afad3559637960bf311170cdd80ed35287f4623eb2320a660 - $ cat `hg config lfs.usercache`/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 | $TESTDIR/f --sha256 - sha256=fa82ca222fc9813afad3559637960bf311170cdd80ed35287f4623eb2320a660 + $ test -f ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 + [1] + $ test -f `hg config lfs.usercache`/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 + [1] $ cp blob $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 Test a corrupted file upload @@ -146,7 +143,8 @@ Test a corrupted file upload pushing to ../repo1 searching for changes lfs: uploading e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0 (17 bytes) - abort: HTTP error: HTTP Error 500: Internal Server Error (oid=e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0, action=upload)! + abort: detected corrupt lfs object: e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0 + (run hg verify) [255] Check error message when the remote missed a blob: diff --git a/tests/test-lfs.t b/tests/test-lfs.t --- a/tests/test-lfs.t +++ b/tests/test-lfs.t @@ -704,18 +704,17 @@ Damaging a file required by the update d updating to branch default resolving manifests getting l - lfs: adding 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b to the usercache - lfs: found 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b in the local lfs store - abort: integrity check failed on data/l.i:3! + abort: detected corrupt lfs object: 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b + (run hg verify) [255] -BUG: A corrupted lfs blob either shouldn't be created after a transfer from a -file://remotestore, or it shouldn't be left behind. +A corrupted lfs blob is not transferred from a file://remotestore to the +usercache or local store. - $ cat emptycache/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256 - sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff - $ cat fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256 - sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff + $ test -f emptycache/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b + [1] + $ test -f fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b + [1] $ hg -R fromcorrupt2 verify checking changesets @@ -723,14 +722,13 @@ file://remotestore, or it shouldn't be l crosschecking files in changesets and manifests checking files l@1: unpacking 46a2f24864bc: integrity check failed on data/l.i:0 - l@4: unpacking 6f1ff1f39c11: integrity check failed on data/l.i:3 large@0: unpacking 2c531e0992ff: integrity check failed on data/large.i:0 4 files, 5 changesets, 10 total revisions - 3 integrity errors encountered! + 2 integrity errors encountered! (first damaged changeset appears to be 0) [1] -BUG: push will happily send corrupt files upstream. (The alternate dummy remote +Corrupt local files are not sent upstream. (The alternate dummy remote avoids the corrupt lfs object in the original remote.) $ mkdir $TESTTMP/dummy-remote2 @@ -742,18 +740,9 @@ avoids the corrupt lfs object in the ori lfs: found 89b6070915a3d573ff3599d1cda305bc5e38549b15c4847ab034169da66e1ca8 in the local lfs store lfs: found b1a6ea88da0017a0e77db139a54618986e9a2489bee24af9fe596de9daac498c in the local lfs store lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store - 5 changesets found - uncompressed size of bundle content: - 997 (changelog) - 1032 (manifests) - 841 l - 272 large - 788 s - 139 small - adding changesets - adding manifests - adding file changes - added 5 changesets with 10 changes to 4 files + abort: detected corrupt lfs object: 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e + (run hg verify) + [255] $ hg -R fromcorrupt2 --config lfs.url=file:///$TESTTMP/dummy-remote2 verify -v repository uses revlog format 1 @@ -764,27 +753,21 @@ avoids the corrupt lfs object in the ori lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store l@1: unpacking 46a2f24864bc: integrity check failed on data/l.i:0 lfs: found 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b in the local lfs store - l@4: unpacking 6f1ff1f39c11: integrity check failed on data/l.i:3 lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store large@0: unpacking 2c531e0992ff: integrity check failed on data/large.i:0 lfs: found 89b6070915a3d573ff3599d1cda305bc5e38549b15c4847ab034169da66e1ca8 in the local lfs store lfs: found b1a6ea88da0017a0e77db139a54618986e9a2489bee24af9fe596de9daac498c in the local lfs store 4 files, 5 changesets, 10 total revisions - 3 integrity errors encountered! + 2 integrity errors encountered! (first damaged changeset appears to be 0) [1] $ cat $TESTTMP/dummy-remote2/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256 - sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff + sha256=22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b $ cat fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256 - sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff - - $ cat $TESTTMP/dummy-remote2/66/100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e - LONGER-THAN-TEN-BYTES-WILL-TRIGGER-LFS - damage - $ cat $TESTTMP/dummy-remote2/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b - RESTORE-TO-BE-LARGE - damage + sha256=22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b + $ test -f $TESTTMP/dummy-remote2/66/100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e + [1] Accessing a corrupt file will complain