##// END OF EJS Templates
verify: document corner cases...
Jun Wu -
r31761:b044c339 default
parent child Browse files
Show More
@@ -1,434 +1,480 b''
1 1 # verify.py - repository integrity checking for Mercurial
2 2 #
3 3 # Copyright 2006, 2007 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import os
11 11
12 12 from .i18n import _
13 13 from .node import (
14 14 nullid,
15 15 short,
16 16 )
17 17
18 18 from . import (
19 19 error,
20 20 revlog,
21 21 scmutil,
22 22 util,
23 23 )
24 24
def verify(repo):
    """Run a full integrity check of repo while holding its lock.

    The result is whatever verifier(repo).verify() returns.
    """
    lock = repo.lock()
    with lock:
        checker = verifier(repo)
        return checker.verify()
28 28
29 29 def _normpath(f):
30 30 # under hg < 2.4, convert didn't sanitize paths properly, so a
31 31 # converted repo may contain repeated slashes
32 32 while '//' in f:
33 33 f = f.replace('//', '/')
34 34 return f
35 35
class verifier(object):
    """Integrity checker for a local repository.

    An instance walks the changelog, the manifests and every filelog,
    reports problems through ``ui.warn`` and tallies them in ``errors``
    and ``warnings``. Revisions implicated in an error are collected in
    ``badrevs``.
    """

    # In hg core the match argument is always None; an extension such as
    # narrowhg may hand in its own matcher instead.
    def __init__(self, repo, match=None):
        """Record the repository state the checks depend on.

        The unfiltered repo is stored, while the emptiness/format flags
        are read from the repo object that was passed in.
        """
        self.repo = repo.unfiltered()
        self.ui = repo.ui
        if not match:
            match = scmutil.matchall(repo)
        self.match = match
        # bookkeeping updated by warn() / err()
        self.badrevs = set()
        self.errors = 0
        self.warnings = 0
        # facts about the repository consulted by the individual checks
        self.havecl = len(repo.changelog) > 0
        self.havemf = len(repo.manifestlog._revlog) > 0
        self.revlogv1 = repo.changelog.version != revlog.REVLOGV0
        self.lrugetctx = util.lrucachefunc(repo.changectx)
        # set once a changeset refers to a non-null manifest (or cannot
        # be unpacked at all)
        self.refersmf = False
        # set once a revlog turns out to be missing from the fncache
        self.fncachewarned = False

    def warn(self, msg):
        """Emit msg followed by a newline and count one warning."""
        self.ui.warn(msg + "\n")
        self.warnings += 1

    def err(self, linkrev, msg, filename=None):
        """Emit an error line and count one error.

        A linkrev other than None is remembered in badrevs; None is shown
        as '?'. The line reads "[filename@]linkrev: msg".
        """
        if linkrev is None:
            shown = '?'
        else:
            self.badrevs.add(linkrev)
            shown = linkrev
        text = "%s: %s" % (shown, msg)
        if filename:
            text = "%s@%s" % (filename, text)
        self.ui.warn(" " + text + "\n")
        self.errors += 1

    def exc(self, linkrev, msg, inst, filename=None):
        """Report the exception inst as an error prefixed with msg.

        When str(inst) is empty the repr of the exception is shown.
        """
        detail = inst if str(inst) else repr(inst)
        self.err(linkrev, "%s: %s" % (msg, detail), filename)

    def checklog(self, obj, name, linkrev):
        """Run the whole-revlog sanity checks on obj.

        Reports an empty revlog (when the repo has a changelog or a
        manifest), the discrepancies returned by obj.checksize(), and a
        revlog format that disagrees with the changelog's.
        """
        if (self.havecl or self.havemf) and not len(obj):
            self.err(linkrev, _("empty or missing %s") % name)
            return

        sizes = obj.checksize()
        if sizes[0]:
            self.err(None, _("data length off by %d bytes") % sizes[0], name)
        if sizes[1]:
            self.err(None, _("index contains %d extra bytes") % sizes[1],
                     name)

        isv1 = obj.version != revlog.REVLOGV0
        if isv1 and not self.revlogv1:
            self.warn(_("warning: `%s' uses revlog format 1") % name)
        elif not isv1 and self.revlogv1:
            self.warn(_("warning: `%s' uses revlog format 0") % name)

    def checkentry(self, obj, i, node, seen, linkrevs, f):
        """Check revision i (with id node) of the revlog obj.

        Verifies the linkrev against the expected linkrevs, that both
        parents were already seen, and that node is not a duplicate.
        seen maps node -> revision number and is updated in place.

        Returns the linkrev, or None when it cannot be trusted.
        """
        lr = obj.linkrev(obj.rev(node))
        if lr < 0 or (self.havecl and lr not in linkrevs):
            if lr < 0 or lr >= len(self.repo.changelog):
                template = _("rev %d points to nonexistent changeset %d")
            else:
                template = _("rev %d points to unexpected changeset %d")
            self.err(None, template % (i, lr), f)
            if linkrevs:
                if f and len(linkrevs) > 1:
                    try:
                        # attempt to filter down to real linkrevs
                        linkrevs = [
                            candidate for candidate in linkrevs
                            if self.lrugetctx(candidate)[f].filenode() == node
                        ]
                    except Exception:
                        # best effort only: keep the unfiltered list
                        pass
                expected = " ".join(map(str, linkrevs))
                self.warn(_(" (expected %s)") % expected)
            lr = None # can't be trusted

        try:
            p1, p2 = obj.parents(node)
            if not (p1 in seen or p1 == nullid):
                self.err(lr, _("unknown parent 1 %s of %s") %
                         (short(p1), short(node)), f)
            if not (p2 in seen or p2 == nullid):
                self.err(lr, _("unknown parent 2 %s of %s") %
                         (short(p2), short(node)), f)
        except Exception as inst:
            self.exc(lr, _("checking parents of %s") % short(node), inst, f)

        if node in seen:
            self.err(lr, _("duplicate revision %d (%d)") % (i, seen[node]), f)
        seen[node] = i
        return lr

    def verify(self):
        """Run all checks and print a summary.

        Order: changelog, manifests, changeset/manifest cross-check, files.
        Raises error.Abort when repo.url() does not start with 'file:'.
        Returns 1 if any error was recorded; otherwise returns None.
        """
        repo = self.repo
        ui = repo.ui

        if not repo.url().startswith('file:'):
            raise error.Abort(_("cannot verify bundle or remote repos"))

        if os.path.exists(repo.sjoin("journal")):
            ui.warn(_("abandoned transaction found - run hg recover\n"))

        if ui.verbose or not self.revlogv1:
            ui.status(_("repository uses revlog format %d\n") %
                      (1 if self.revlogv1 else 0))

        mflinkrevs, filelinkrevs = self._verifychangelog()

        filenodes = self._verifymanifest(mflinkrevs)
        # no longer needed once the manifests have been walked
        del mflinkrevs

        self._crosscheckfiles(filelinkrevs, filenodes)

        totalfiles, filerevisions = self._verifyfiles(filenodes, filelinkrevs)

        ui.status(_("%d files, %d changesets, %d total revisions\n") %
                  (totalfiles, len(repo.changelog), filerevisions))
        if self.warnings:
            ui.warn(_("%d warnings encountered!\n") % self.warnings)
        if self.fncachewarned:
            ui.warn(_('hint: run "hg debugrebuildfncache" to recover from '
                      'corrupt fncache\n'))
        if not self.errors:
            return
        ui.warn(_("%d integrity errors encountered!\n") % self.errors)
        if self.badrevs:
            ui.warn(_("(first damaged changeset appears to be %d)\n")
                    % min(self.badrevs))
        return 1

    def _verifychangelog(self):
        """Check every changelog entry.

        Returns (mflinkrevs, filelinkrevs): manifest node -> list of
        changeset revs referring to it, and normalized file path -> list
        of changeset revs touching it (matched files only).
        """
        ui = self.ui
        repo = self.repo
        match = self.match
        cl = repo.changelog

        ui.status(_("checking changesets\n"))
        mflinkrevs = {}
        filelinkrevs = {}
        seen = {}
        self.checklog(cl, "changelog", 0)
        total = len(repo)
        for rev in repo:
            ui.progress(_('checking'), rev, total=total, unit=_('changesets'))
            node = cl.node(rev)
            self.checkentry(cl, rev, node, seen, [rev], "changelog")

            try:
                changes = cl.read(node)
                manifestnode = changes[0]
                if manifestnode != nullid:
                    mflinkrevs.setdefault(manifestnode, []).append(rev)
                    self.refersmf = True
                for path in changes[3]:
                    if match(path):
                        touched = filelinkrevs.setdefault(_normpath(path), [])
                        touched.append(rev)
            except Exception as inst:
                self.refersmf = True
                self.exc(rev, _("unpacking changeset %s") % short(node), inst)
        ui.progress(_('checking'), None)
        return mflinkrevs, filelinkrevs

    def _verifymanifest(self, mflinkrevs, dir="", storefiles=None,
                        progress=None):
        """Check the manifest revlog for dir, recursing into subdirectories.

        mflinkrevs maps manifest node -> expected linkrevs; entries are
        removed as the nodes are found. storefiles and progress are
        supplied by the top-level call when it recurses: the set of
        'meta/' store files not yet accounted for, and a callback that
        advances the progress bar.

        Returns a dict: file path -> {file node: linkrev}.
        """
        repo = self.repo
        ui = self.ui
        match = self.match
        mfl = repo.manifestlog
        mf = mfl._revlog.dirlog(dir)
        toplevel = not dir

        if toplevel:
            ui.status(_("checking manifests\n"))

        filenodes = {}
        subdirnodes = {}
        seen = {}
        if toplevel:
            label = "manifest"
        else:
            label = dir
            storefiles.difference_update(mf.files())
            if progress: # should be true since we're in a subdirectory
                progress()
        if self.refersmf:
            # Do not check manifest if there are only changelog entries with
            # null manifests.
            self.checklog(mf, label, 0)
        total = len(mf)
        for rev in mf:
            if toplevel:
                ui.progress(_('checking'), rev, total=total,
                            unit=_('manifests'))
            node = mf.node(rev)
            lr = self.checkentry(mf, rev, node, seen,
                                 mflinkrevs.get(node, []), label)
            if node in mflinkrevs:
                del mflinkrevs[node]
            elif toplevel:
                self.err(lr, _("%s not in changesets") % short(node), label)
            else:
                self.err(lr, _("%s not in parent-directory manifest") %
                         short(node), label)

            try:
                mfdelta = mfl.get(dir, node).readdelta(shallow=True)
                for name, fnode, flag in mfdelta.iterentries():
                    if not name:
                        self.err(lr, _("entry without name in manifest"))
                    elif name == "/dev/null": # ignore this in very old repos
                        continue
                    fullpath = dir + _normpath(name)
                    if flag == 't':
                        # a 't' entry is queued as a subdirectory manifest
                        if match.visitdir(fullpath):
                            bynode = subdirnodes.setdefault(fullpath + '/', {})
                            bynode.setdefault(fnode, []).append(lr)
                    elif match(fullpath):
                        filenodes.setdefault(fullpath, {}).setdefault(fnode,
                                                                      lr)
            except Exception as inst:
                self.exc(lr, _("reading delta %s") % short(node), inst, label)
        if toplevel:
            ui.progress(_('checking'), None)

        if self.havemf:
            # whatever is left in mflinkrevs was referenced but never found
            dangling = [(c, m) for m in mflinkrevs for c in mflinkrevs[m]]
            dangling.sort()
            for c, m in dangling:
                if toplevel:
                    self.err(c, _("changeset refers to unknown revision %s") %
                             short(m), label)
                else:
                    self.err(c, _("parent-directory manifest refers to unknown "
                                  "revision %s") % short(m), label)

        walksubdirs = toplevel and bool(subdirnodes)
        if walksubdirs:
            ui.status(_("checking directory manifests\n"))
            storefiles = set()
            subdirs = set()
            revlogv1 = self.revlogv1
            for f, f2, size in repo.store.datafiles():
                if not f:
                    self.err(None, _("cannot decode filename '%s'") % f2)
                elif (size > 0 or not revlogv1) and f.startswith('meta/'):
                    storefiles.add(_normpath(f))
                    subdirs.add(os.path.dirname(f))
            subdircount = len(subdirs)
            # one-element list so the closure below can update the counter
            currentsubdir = [0]
            def progress():
                currentsubdir[0] += 1
                ui.progress(_('checking'), currentsubdir[0], total=subdircount,
                            unit=_('manifests'))

        for subdir, linkrevs in subdirnodes.iteritems():
            subdirfilenodes = self._verifymanifest(linkrevs, subdir,
                                                   storefiles, progress)
            for path, onefilenodes in subdirfilenodes.iteritems():
                filenodes.setdefault(path, {}).update(onefilenodes)

        if walksubdirs:
            ui.progress(_('checking'), None)
            for orphan in sorted(storefiles):
                self.warn(_("warning: orphan revlog '%s'") % orphan)

        return filenodes

    def _crosscheckfiles(self, filelinkrevs, filenodes):
        """Report files known to only one of changesets and manifests."""
        repo = self.repo
        ui = self.ui
        ui.status(_("crosschecking files in changesets and manifests\n"))

        total = len(filelinkrevs) + len(filenodes)
        done = 0
        if self.havemf:
            for done, f in enumerate(sorted(filelinkrevs), done + 1):
                ui.progress(_('crosschecking'), done, total=total)
                if f not in filenodes:
                    self.err(filelinkrevs[f][0],
                             _("in changeset but not in manifest"), f)

        if self.havecl:
            # the counter carries on from where the first pass stopped
            for done, f in enumerate(sorted(filenodes), done + 1):
                ui.progress(_('crosschecking'), done, total=total)
                if f in filelinkrevs:
                    continue
                try:
                    fl = repo.file(f)
                    lr = min(fl.linkrev(fl.rev(n)) for n in filenodes[f])
                except Exception:
                    lr = None
                self.err(lr, _("in manifest but not in changeset"), f)

        ui.progress(_('crosschecking'), None)

    def _verifyfiles(self, filenodes, filelinkrevs):
        """Check every filelog named by the manifests or the changesets.

        filenodes (path -> {node: linkrev}) is consumed in place: nodes
        found in a filelog are deleted, leftovers are reported.

        Returns (number of files, number of file revisions visited).
        """
        repo = self.repo
        ui = self.ui
        lrugetctx = self.lrugetctx
        revlogv1 = self.revlogv1
        havemf = self.havemf
        ui.status(_("checking files\n"))

        # 'data/' store files; entries are removed as filelogs claim them
        storefiles = set()
        for f, f2, size in repo.store.datafiles():
            if not f:
                self.err(None, _("cannot decode filename '%s'") % f2)
            elif (size > 0 or not revlogv1) and f.startswith('data/'):
                storefiles.add(_normpath(f))

        files = sorted(set(filenodes) | set(filelinkrevs))
        total = len(files)
        revisions = 0
        for pos, f in enumerate(files):
            ui.progress(_('checking'), pos, item=f, total=total,
                        unit=_('files'))
            # an absent key means: in manifest but not in changelog
            linkrevs = filelinkrevs.get(f, [])
            lr = linkrevs[0] if linkrevs else None

            try:
                fl = repo.file(f)
            except error.RevlogError as e:
                self.err(lr, _("broken revlog! (%s)") % e, f)
                continue

            for ff in fl.files():
                try:
                    storefiles.remove(ff)
                except KeyError:
                    self.warn(_(" warning: revlog '%s' not in fncache!") % ff)
                    self.fncachewarned = True

            self.checklog(fl, f, lr)
            seen = {}
            # reset once per file, not per revision: a failed read below
            # leaves the previous revision's rename info in place
            rp = None
            for rev in fl:
                revisions += 1
                n = fl.node(rev)
                lr = self.checkentry(fl, rev, n, seen, linkrevs, f)
                if f in filenodes:
                    if havemf and n not in filenodes[f]:
                        self.err(lr, _("%s not in manifests") % (short(n)), f)
                    else:
                        del filenodes[f][n]

                # Verify contents. 4 cases to care about:
                #
                #   common: the most common case
                #   rename: with a rename
                #   meta:   file content starts with b'\1\n', the metadata
                #           header defined in filelog.py, but without a rename
                #   ext:    content stored externally
                #
                # More formally, their differences are shown below:
                #
                #                        | common | rename | meta  | ext
                #   -------------------------------------------------------
                #   flags()              | 0      | 0      | 0     | not 0
                #   renamed()            | False  | True   | False | ?
                #   rawtext[0:2]=='\1\n' | False  | True   | True  | ?
                #
                # "rawtext" means the raw text stored in revlog data, which
                # could be retrieved by "revision(rev, raw=True)". "text"
                # mentioned below is "revision(rev, raw=False)".
                #
                # There are 3 different lengths stored physically:
                #  1. L1: rawsize, stored in revlog index
                #  2. L2: len(rawtext), stored in revlog data
                #  3. L3: len(text), stored in revlog data if flags==0, or
                #     possibly somewhere else if flags!=0
                #
                # L1 should be equal to L2. L3 could be different from them.
                # "text" may or may not affect commit hash depending on flag
                # processors (see revlog.addflagprocessor).
                #
                #                | common | rename | meta  | ext
                #   -------------------------------------------------
                #   rawsize()    | L1     | L1     | L1    | L1
                #   size()       | L1     | L2-LM  | L1(*) | L1 (?)
                #   len(rawtext) | L2     | L2     | L2    | L2
                #   len(text)    | L2     | L2     | L2    | L3
                #   len(read())  | L2     | L2-LM  | L2-LM | L3 (?)
                #
                # LM:  length of metadata, depending on rawtext
                # (*): not ideal, see comment in filelog.size
                # (?): could be "- len(meta)" if the resolved content has
                #      rename metadata
                #
                # Checks needed to be done:
                #  1. length check: L1 == L2, in all cases.
                #  2. hash check: depending on flag processor, we may need to
                #     use either "text" (external), or "rawtext" (in revlog).
                try:
                    readlen = len(fl.read(n))
                    rp = fl.renamed(n)
                    # only an error if the full revision length disagrees too
                    if (readlen != fl.size(rev)
                        and len(fl.revision(n)) != fl.size(rev)):
                        self.err(lr, _("unpacked size is %s, %s expected") %
                                 (readlen, fl.size(rev)), f)
                except error.CensoredNodeError:
                    # experimental config: censor.policy
                    if ui.config("censor", "policy", "abort") == "abort":
                        self.err(lr, _("censored file data"), f)
                except Exception as inst:
                    self.exc(lr, _("unpacking %s") % short(n), inst, f)

                # check renames
                try:
                    if rp:
                        if lr is not None and ui.verbose:
                            ctx = lrugetctx(lr)
                            if not any(rp[0] in pctx
                                       for pctx in ctx.parents()):
                                self.warn(_("warning: copy source of '%s' not"
                                            " in parents of %s") % (f, ctx))
                        fl2 = repo.file(rp[0])
                        if not len(fl2):
                            self.err(lr, _("empty or missing copy source "
                                           "revlog %s:%s")
                                     % (rp[0], short(rp[1])), f)
                        elif rp[1] == nullid:
                            ui.note(_("warning: %s@%s: copy source"
                                      " revision is nullid %s:%s\n")
                                    % (f, lr, rp[0], short(rp[1])))
                        else:
                            # called for its side effect: the lookup result
                            # is discarded, failures land in the handler
                            fl2.rev(rp[1])
                except Exception as inst:
                    self.exc(lr, _("checking rename of %s") % short(n),
                             inst, f)

            # cross-check: nodes still listed were never found in the filelog
            if f in filenodes:
                leftovers = [(v, k) for k, v in filenodes[f].iteritems()]
                leftovers.sort()
                for badlr, node in leftovers:
                    self.err(badlr,
                             _("manifest refers to unknown revision %s") %
                             short(node), f)
        ui.progress(_('checking'), None)

        for orphan in sorted(storefiles):
            self.warn(_("warning: orphan revlog '%s'") % orphan)

        return len(files), revisions
General Comments 0
You need to be logged in to leave comments. Login now