##// END OF EJS Templates
snapshot: also consider the snapshot chain of one unrelated revision...
Boris Feld -
r39541:c6b8eab5 default
parent child Browse files
Show More
@@ -1,901 +1,932 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import heapq
14 14 import struct
15 15
16 16 # import stuff from node for others to import from revlog
17 17 from ..node import (
18 18 nullrev,
19 19 )
20 20 from ..i18n import _
21 21
22 22 from .constants import (
23 23 REVIDX_ISCENSORED,
24 24 REVIDX_RAWTEXT_CHANGING_FLAGS,
25 25 )
26 26
27 27 from ..thirdparty import (
28 28 attr,
29 29 )
30 30
31 31 from .. import (
32 32 error,
33 33 mdiff,
34 34 )
35 35
36 36 RevlogError = error.RevlogError
37 37 CensoredNodeError = error.CensoredNodeError
38 38
39 39 # maximum <delta-chain-data>/<revision-text-length> ratio
40 40 LIMIT_DELTA2TEXT = 2
41 41
42 42 class _testrevlog(object):
43 43 """minimalist fake revlog to use in doctests"""
44 44
45 45 def __init__(self, data, density=0.5, mingap=0):
46 46 """data is an list of revision payload boundaries"""
47 47 self._data = data
48 48 self._srdensitythreshold = density
49 49 self._srmingapsize = mingap
50 50
51 51 def start(self, rev):
52 52 if rev == 0:
53 53 return 0
54 54 return self._data[rev - 1]
55 55
56 56 def end(self, rev):
57 57 return self._data[rev]
58 58
59 59 def length(self, rev):
60 60 return self.end(rev) - self.start(rev)
61 61
62 62 def __len__(self):
63 63 return len(self._data)
64 64
65 65 def slicechunk(revlog, revs, deltainfo=None, targetsize=None):
66 66 """slice revs to reduce the amount of unrelated data to be read from disk.
67 67
68 68 ``revs`` is sliced into groups that should be read in one time.
69 69 Assume that revs are sorted.
70 70
71 71 The initial chunk is sliced until the overall density (payload/chunks-span
72 72 ratio) is above `revlog._srdensitythreshold`. No gap smaller than
73 73 `revlog._srmingapsize` is skipped.
74 74
75 75 If `targetsize` is set, no chunk larger than `targetsize` will be yield.
76 76 For consistency with other slicing choice, this limit won't go lower than
77 77 `revlog._srmingapsize`.
78 78
79 79 If individual revisions chunk are larger than this limit, they will still
80 80 be raised individually.
81 81
82 82 >>> revlog = _testrevlog([
83 83 ... 5, #00 (5)
84 84 ... 10, #01 (5)
85 85 ... 12, #02 (2)
86 86 ... 12, #03 (empty)
87 87 ... 27, #04 (15)
88 88 ... 31, #05 (4)
89 89 ... 31, #06 (empty)
90 90 ... 42, #07 (11)
91 91 ... 47, #08 (5)
92 92 ... 47, #09 (empty)
93 93 ... 48, #10 (1)
94 94 ... 51, #11 (3)
95 95 ... 74, #12 (23)
96 96 ... 85, #13 (11)
97 97 ... 86, #14 (1)
98 98 ... 91, #15 (5)
99 99 ... ])
100 100
101 101 >>> list(slicechunk(revlog, list(range(16))))
102 102 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
103 103 >>> list(slicechunk(revlog, [0, 15]))
104 104 [[0], [15]]
105 105 >>> list(slicechunk(revlog, [0, 11, 15]))
106 106 [[0], [11], [15]]
107 107 >>> list(slicechunk(revlog, [0, 11, 13, 15]))
108 108 [[0], [11, 13, 15]]
109 109 >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
110 110 [[1, 2], [5, 8, 10, 11], [14]]
111 111
112 112 Slicing with a maximum chunk size
113 113 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
114 114 [[0], [11], [13], [15]]
115 115 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
116 116 [[0], [11], [13, 15]]
117 117 """
118 118 if targetsize is not None:
119 119 targetsize = max(targetsize, revlog._srmingapsize)
120 120 # targetsize should not be specified when evaluating delta candidates:
121 121 # * targetsize is used to ensure we stay within specification when reading,
122 122 # * deltainfo is used to pick are good delta chain when writing.
123 123 if not (deltainfo is None or targetsize is None):
124 124 msg = 'cannot use `targetsize` with a `deltainfo`'
125 125 raise error.ProgrammingError(msg)
126 126 for chunk in _slicechunktodensity(revlog, revs,
127 127 deltainfo,
128 128 revlog._srdensitythreshold,
129 129 revlog._srmingapsize):
130 130 for subchunk in _slicechunktosize(revlog, chunk, targetsize):
131 131 yield subchunk
132 132
133 133 def _slicechunktosize(revlog, revs, targetsize=None):
134 134 """slice revs to match the target size
135 135
136 136 This is intended to be used on chunk that density slicing selected by that
137 137 are still too large compared to the read garantee of revlog. This might
138 138 happens when "minimal gap size" interrupted the slicing or when chain are
139 139 built in a way that create large blocks next to each other.
140 140
141 141 >>> revlog = _testrevlog([
142 142 ... 3, #0 (3)
143 143 ... 5, #1 (2)
144 144 ... 6, #2 (1)
145 145 ... 8, #3 (2)
146 146 ... 8, #4 (empty)
147 147 ... 11, #5 (3)
148 148 ... 12, #6 (1)
149 149 ... 13, #7 (1)
150 150 ... 14, #8 (1)
151 151 ... ])
152 152
153 153 Cases where chunk is already small enough
154 154 >>> list(_slicechunktosize(revlog, [0], 3))
155 155 [[0]]
156 156 >>> list(_slicechunktosize(revlog, [6, 7], 3))
157 157 [[6, 7]]
158 158 >>> list(_slicechunktosize(revlog, [0], None))
159 159 [[0]]
160 160 >>> list(_slicechunktosize(revlog, [6, 7], None))
161 161 [[6, 7]]
162 162
163 163 cases where we need actual slicing
164 164 >>> list(_slicechunktosize(revlog, [0, 1], 3))
165 165 [[0], [1]]
166 166 >>> list(_slicechunktosize(revlog, [1, 3], 3))
167 167 [[1], [3]]
168 168 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
169 169 [[1, 2], [3]]
170 170 >>> list(_slicechunktosize(revlog, [3, 5], 3))
171 171 [[3], [5]]
172 172 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
173 173 [[3], [5]]
174 174 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
175 175 [[5], [6, 7, 8]]
176 176 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
177 177 [[0], [1, 2], [3], [5], [6, 7, 8]]
178 178
179 179 Case with too large individual chunk (must return valid chunk)
180 180 >>> list(_slicechunktosize(revlog, [0, 1], 2))
181 181 [[0], [1]]
182 182 >>> list(_slicechunktosize(revlog, [1, 3], 1))
183 183 [[1], [3]]
184 184 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
185 185 [[3], [5]]
186 186 """
187 187 assert targetsize is None or 0 <= targetsize
188 188 if targetsize is None or segmentspan(revlog, revs) <= targetsize:
189 189 yield revs
190 190 return
191 191
192 192 startrevidx = 0
193 193 startdata = revlog.start(revs[0])
194 194 endrevidx = 0
195 195 iterrevs = enumerate(revs)
196 196 next(iterrevs) # skip first rev.
197 197 for idx, r in iterrevs:
198 198 span = revlog.end(r) - startdata
199 199 if span <= targetsize:
200 200 endrevidx = idx
201 201 else:
202 202 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1)
203 203 if chunk:
204 204 yield chunk
205 205 startrevidx = idx
206 206 startdata = revlog.start(r)
207 207 endrevidx = idx
208 208 yield _trimchunk(revlog, revs, startrevidx)
209 209
210 210 def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5,
211 211 mingapsize=0):
212 212 """slice revs to reduce the amount of unrelated data to be read from disk.
213 213
214 214 ``revs`` is sliced into groups that should be read in one time.
215 215 Assume that revs are sorted.
216 216
217 217 ``deltainfo`` is a _deltainfo instance of a revision that we would append
218 218 to the top of the revlog.
219 219
220 220 The initial chunk is sliced until the overall density (payload/chunks-span
221 221 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
222 222 skipped.
223 223
224 224 >>> revlog = _testrevlog([
225 225 ... 5, #00 (5)
226 226 ... 10, #01 (5)
227 227 ... 12, #02 (2)
228 228 ... 12, #03 (empty)
229 229 ... 27, #04 (15)
230 230 ... 31, #05 (4)
231 231 ... 31, #06 (empty)
232 232 ... 42, #07 (11)
233 233 ... 47, #08 (5)
234 234 ... 47, #09 (empty)
235 235 ... 48, #10 (1)
236 236 ... 51, #11 (3)
237 237 ... 74, #12 (23)
238 238 ... 85, #13 (11)
239 239 ... 86, #14 (1)
240 240 ... 91, #15 (5)
241 241 ... ])
242 242
243 243 >>> list(_slicechunktodensity(revlog, list(range(16))))
244 244 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
245 245 >>> list(_slicechunktodensity(revlog, [0, 15]))
246 246 [[0], [15]]
247 247 >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
248 248 [[0], [11], [15]]
249 249 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
250 250 [[0], [11, 13, 15]]
251 251 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
252 252 [[1, 2], [5, 8, 10, 11], [14]]
253 253 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
254 254 ... mingapsize=20))
255 255 [[1, 2, 3, 5, 8, 10, 11], [14]]
256 256 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
257 257 ... targetdensity=0.95))
258 258 [[1, 2], [5], [8, 10, 11], [14]]
259 259 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
260 260 ... targetdensity=0.95, mingapsize=12))
261 261 [[1, 2], [5, 8, 10, 11], [14]]
262 262 """
263 263 start = revlog.start
264 264 length = revlog.length
265 265
266 266 if len(revs) <= 1:
267 267 yield revs
268 268 return
269 269
270 270 nextrev = len(revlog)
271 271 nextoffset = revlog.end(nextrev - 1)
272 272
273 273 if deltainfo is None:
274 274 deltachainspan = segmentspan(revlog, revs)
275 275 chainpayload = sum(length(r) for r in revs)
276 276 else:
277 277 deltachainspan = deltainfo.distance
278 278 chainpayload = deltainfo.compresseddeltalen
279 279
280 280 if deltachainspan < mingapsize:
281 281 yield revs
282 282 return
283 283
284 284 readdata = deltachainspan
285 285
286 286 if deltachainspan:
287 287 density = chainpayload / float(deltachainspan)
288 288 else:
289 289 density = 1.0
290 290
291 291 if density >= targetdensity:
292 292 yield revs
293 293 return
294 294
295 295 if deltainfo is not None and deltainfo.deltalen:
296 296 revs = list(revs)
297 297 revs.append(nextrev)
298 298
299 299 # Store the gaps in a heap to have them sorted by decreasing size
300 300 gapsheap = []
301 301 heapq.heapify(gapsheap)
302 302 prevend = None
303 303 for i, rev in enumerate(revs):
304 304 if rev < nextrev:
305 305 revstart = start(rev)
306 306 revlen = length(rev)
307 307 else:
308 308 revstart = nextoffset
309 309 revlen = deltainfo.deltalen
310 310
311 311 # Skip empty revisions to form larger holes
312 312 if revlen == 0:
313 313 continue
314 314
315 315 if prevend is not None:
316 316 gapsize = revstart - prevend
317 317 # only consider holes that are large enough
318 318 if gapsize > mingapsize:
319 319 heapq.heappush(gapsheap, (-gapsize, i))
320 320
321 321 prevend = revstart + revlen
322 322
323 323 # Collect the indices of the largest holes until the density is acceptable
324 324 indicesheap = []
325 325 heapq.heapify(indicesheap)
326 326 while gapsheap and density < targetdensity:
327 327 oppgapsize, gapidx = heapq.heappop(gapsheap)
328 328
329 329 heapq.heappush(indicesheap, gapidx)
330 330
331 331 # the gap sizes are stored as negatives to be sorted decreasingly
332 332 # by the heap
333 333 readdata -= (-oppgapsize)
334 334 if readdata > 0:
335 335 density = chainpayload / float(readdata)
336 336 else:
337 337 density = 1.0
338 338
339 339 # Cut the revs at collected indices
340 340 previdx = 0
341 341 while indicesheap:
342 342 idx = heapq.heappop(indicesheap)
343 343
344 344 chunk = _trimchunk(revlog, revs, previdx, idx)
345 345 if chunk:
346 346 yield chunk
347 347
348 348 previdx = idx
349 349
350 350 chunk = _trimchunk(revlog, revs, previdx)
351 351 if chunk:
352 352 yield chunk
353 353
354 354 def _trimchunk(revlog, revs, startidx, endidx=None):
355 355 """returns revs[startidx:endidx] without empty trailing revs
356 356
357 357 Doctest Setup
358 358 >>> revlog = _testrevlog([
359 359 ... 5, #0
360 360 ... 10, #1
361 361 ... 12, #2
362 362 ... 12, #3 (empty)
363 363 ... 17, #4
364 364 ... 21, #5
365 365 ... 21, #6 (empty)
366 366 ... ])
367 367
368 368 Contiguous cases:
369 369 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
370 370 [0, 1, 2, 3, 4, 5]
371 371 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
372 372 [0, 1, 2, 3, 4]
373 373 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
374 374 [0, 1, 2]
375 375 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
376 376 [2]
377 377 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
378 378 [3, 4, 5]
379 379 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
380 380 [3, 4]
381 381
382 382 Discontiguous cases:
383 383 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
384 384 [1, 3, 5]
385 385 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
386 386 [1]
387 387 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
388 388 [3, 5]
389 389 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
390 390 [3, 5]
391 391 """
392 392 length = revlog.length
393 393
394 394 if endidx is None:
395 395 endidx = len(revs)
396 396
397 397 # If we have a non-emtpy delta candidate, there are nothing to trim
398 398 if revs[endidx - 1] < len(revlog):
399 399 # Trim empty revs at the end, except the very first revision of a chain
400 400 while (endidx > 1
401 401 and endidx > startidx
402 402 and length(revs[endidx - 1]) == 0):
403 403 endidx -= 1
404 404
405 405 return revs[startidx:endidx]
406 406
407 407 def segmentspan(revlog, revs, deltainfo=None):
408 408 """Get the byte span of a segment of revisions
409 409
410 410 revs is a sorted array of revision numbers
411 411
412 412 >>> revlog = _testrevlog([
413 413 ... 5, #0
414 414 ... 10, #1
415 415 ... 12, #2
416 416 ... 12, #3 (empty)
417 417 ... 17, #4
418 418 ... ])
419 419
420 420 >>> segmentspan(revlog, [0, 1, 2, 3, 4])
421 421 17
422 422 >>> segmentspan(revlog, [0, 4])
423 423 17
424 424 >>> segmentspan(revlog, [3, 4])
425 425 5
426 426 >>> segmentspan(revlog, [1, 2, 3,])
427 427 7
428 428 >>> segmentspan(revlog, [1, 3])
429 429 7
430 430 """
431 431 if not revs:
432 432 return 0
433 433 if deltainfo is not None and len(revlog) <= revs[-1]:
434 434 if len(revs) == 1:
435 435 return deltainfo.deltalen
436 436 offset = revlog.end(len(revlog) - 1)
437 437 end = deltainfo.deltalen + offset
438 438 else:
439 439 end = revlog.end(revs[-1])
440 440 return end - revlog.start(revs[0])
441 441
442 442 def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
443 443 """build full text from a (base, delta) pair and other metadata"""
444 444 # special case deltas which replace entire base; no need to decode
445 445 # base revision. this neatly avoids censored bases, which throw when
446 446 # they're decoded.
447 447 hlen = struct.calcsize(">lll")
448 448 if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
449 449 len(delta) - hlen):
450 450 fulltext = delta[hlen:]
451 451 else:
452 452 # deltabase is rawtext before changed by flag processors, which is
453 453 # equivalent to non-raw text
454 454 basetext = revlog.revision(baserev, _df=fh, raw=False)
455 455 fulltext = mdiff.patch(basetext, delta)
456 456
457 457 try:
458 458 res = revlog._processflags(fulltext, flags, 'read', raw=True)
459 459 fulltext, validatehash = res
460 460 if validatehash:
461 461 revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
462 462 if flags & REVIDX_ISCENSORED:
463 463 raise RevlogError(_('node %s is not censored') % expectednode)
464 464 except CensoredNodeError:
465 465 # must pass the censored index flag to add censored revisions
466 466 if not flags & REVIDX_ISCENSORED:
467 467 raise
468 468 return fulltext
469 469
470 470 @attr.s(slots=True, frozen=True)
471 471 class _deltainfo(object):
472 472 distance = attr.ib()
473 473 deltalen = attr.ib()
474 474 data = attr.ib()
475 475 base = attr.ib()
476 476 chainbase = attr.ib()
477 477 chainlen = attr.ib()
478 478 compresseddeltalen = attr.ib()
479 479 snapshotdepth = attr.ib()
480 480
481 481 def isgooddeltainfo(revlog, deltainfo, revinfo):
482 482 """Returns True if the given delta is good. Good means that it is within
483 483 the disk span, disk size, and chain length bounds that we know to be
484 484 performant."""
485 485 if deltainfo is None:
486 486 return False
487 487
488 488 # - 'deltainfo.distance' is the distance from the base revision --
489 489 # bounding it limits the amount of I/O we need to do.
490 490 # - 'deltainfo.compresseddeltalen' is the sum of the total size of
491 491 # deltas we need to apply -- bounding it limits the amount of CPU
492 492 # we consume.
493 493
494 494 if revlog._sparserevlog:
495 495 # As sparse-read will be used, we can consider that the distance,
496 496 # instead of being the span of the whole chunk,
497 497 # is the span of the largest read chunk
498 498 base = deltainfo.base
499 499
500 500 if base != nullrev:
501 501 deltachain = revlog._deltachain(base)[0]
502 502 else:
503 503 deltachain = []
504 504
505 505 # search for the first non-snapshot revision
506 506 for idx, r in enumerate(deltachain):
507 507 if not revlog.issnapshot(r):
508 508 break
509 509 deltachain = deltachain[idx:]
510 510 chunks = slicechunk(revlog, deltachain, deltainfo)
511 511 all_span = [segmentspan(revlog, revs, deltainfo)
512 512 for revs in chunks]
513 513 distance = max(all_span)
514 514 else:
515 515 distance = deltainfo.distance
516 516
517 517 textlen = revinfo.textlen
518 518 defaultmax = textlen * 4
519 519 maxdist = revlog._maxdeltachainspan
520 520 if not maxdist:
521 521 maxdist = distance # ensure the conditional pass
522 522 maxdist = max(maxdist, defaultmax)
523 523 if revlog._sparserevlog and maxdist < revlog._srmingapsize:
524 524 # In multiple place, we are ignoring irrelevant data range below a
525 525 # certain size. Be also apply this tradeoff here and relax span
526 526 # constraint for small enought content.
527 527 maxdist = revlog._srmingapsize
528 528
529 529 # Bad delta from read span:
530 530 #
531 531 # If the span of data read is larger than the maximum allowed.
532 532 if maxdist < distance:
533 533 return False
534 534
535 535 # Bad delta from new delta size:
536 536 #
537 537 # If the delta size is larger than the target text, storing the
538 538 # delta will be inefficient.
539 539 if textlen < deltainfo.deltalen:
540 540 return False
541 541
542 542 # Bad delta from cumulated payload size:
543 543 #
544 544 # If the sum of delta get larger than K * target text length.
545 545 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
546 546 return False
547 547
548 548 # Bad delta from chain length:
549 549 #
550 550 # If the number of delta in the chain gets too high.
551 551 if (revlog._maxchainlen
552 552 and revlog._maxchainlen < deltainfo.chainlen):
553 553 return False
554 554
555 555 # bad delta from intermediate snapshot size limit
556 556 #
557 557 # If an intermediate snapshot size is higher than the limit. The
558 558 # limit exist to prevent endless chain of intermediate delta to be
559 559 # created.
560 560 if (deltainfo.snapshotdepth is not None and
561 561 (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
562 562 return False
563 563
564 564 # bad delta if new intermediate snapshot is larger than the previous
565 565 # snapshot
566 566 if (deltainfo.snapshotdepth
567 567 and revlog.length(deltainfo.base) < deltainfo.deltalen):
568 568 return False
569 569
570 570 return True
571 571
572 572 def _candidategroups(revlog, textlen, p1, p2, cachedelta):
573 573 """Provides group of revision to be tested as delta base
574 574
575 575 This top level function focus on emitting groups with unique and worthwhile
576 576 content. See _raw_candidate_groups for details about the group order.
577 577 """
578 578 # should we try to build a delta?
579 579 if not (len(revlog) and revlog._storedeltachains):
580 580 yield None
581 581 return
582 582
583 583 deltalength = revlog.length
584 584 deltaparent = revlog.deltaparent
585 585 good = None
586 586
587 587 deltas_limit = textlen * LIMIT_DELTA2TEXT
588 588
589 589 tested = set([nullrev])
590 590 candidates = _refinedgroups(revlog, p1, p2, cachedelta)
591 591 while True:
592 592 temptative = candidates.send(good)
593 593 if temptative is None:
594 594 break
595 595 group = []
596 596 for rev in temptative:
597 597 # skip over empty delta (no need to include them in a chain)
598 598 while not (rev == nullrev or rev in tested or deltalength(rev)):
599 599 rev = deltaparent(rev)
600 600 tested.add(rev)
601 601 # filter out revision we tested already
602 602 if rev in tested:
603 603 continue
604 604 tested.add(rev)
605 605 # filter out delta base that will never produce good delta
606 606 if deltas_limit < revlog.length(rev):
607 607 continue
608 608 # no need to try a delta against nullrev, this will be done as a
609 609 # last resort.
610 610 if rev == nullrev:
611 611 continue
612 612 # no delta for rawtext-changing revs (see "candelta" for why)
613 613 if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
614 614 continue
615 615 group.append(rev)
616 616 if group:
617 617 # XXX: in the sparse revlog case, group can become large,
618 618 # impacting performances. Some bounding or slicing mecanism
619 619 # would help to reduce this impact.
620 620 good = yield tuple(group)
621 621 yield None
622 622
623 623 def _findsnapshots(revlog, cache, start_rev):
624 624 """find snapshot from start_rev to tip"""
625 625 deltaparent = revlog.deltaparent
626 626 issnapshot = revlog.issnapshot
627 627 for rev in revlog.revs(start_rev):
628 628 if issnapshot(rev):
629 629 cache[deltaparent(rev)].append(rev)
630 630
631 631 def _refinedgroups(revlog, p1, p2, cachedelta):
632 632 good = None
633 633 # First we try to reuse a the delta contained in the bundle.
634 634 # (or from the source revlog)
635 635 #
636 636 # This logic only applies to general delta repositories and can be disabled
637 637 # through configuration. Disabling reuse source delta is useful when
638 638 # we want to make sure we recomputed "optimal" deltas.
639 639 if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
640 640 # Assume what we received from the server is a good choice
641 641 # build delta will reuse the cache
642 642 good = yield (cachedelta[0],)
643 643 if good is not None:
644 644 yield None
645 645 return
646 646 for candidates in _rawgroups(revlog, p1, p2, cachedelta):
647 647 good = yield candidates
648 648 if good is not None:
649 649 break
650 650
651 651 # if we have a refinable value, try to refine it
652 652 if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
653 653 # refine snapshot down
654 654 previous = None
655 655 while previous != good:
656 656 previous = good
657 657 base = revlog.deltaparent(good)
658 658 if base == nullrev:
659 659 break
660 660 good = yield (base,)
661 661 # refine snapshot up
662 662 #
663 663 # XXX the _findsnapshots call can be expensive and is "duplicated" with
664 664 # the one done in `_rawgroups`. Once we start working on performance,
665 665 # we should make the two logics share this computation.
666 666 snapshots = collections.defaultdict(list)
667 667 _findsnapshots(revlog, snapshots, good + 1)
668 668 previous = None
669 669 while good != previous:
670 670 previous = good
671 671 children = tuple(sorted(c for c in snapshots[good]))
672 672 good = yield children
673 673
674 674 # we have found nothing
675 675 yield None
676 676
677 677 def _rawgroups(revlog, p1, p2, cachedelta):
678 678 """Provides group of revision to be tested as delta base
679 679
680 680 This lower level function focus on emitting delta theorically interresting
681 681 without looking it any practical details.
682 682
683 683 The group order aims at providing fast or small candidates first.
684 684 """
685 685 gdelta = revlog._generaldelta
686 686 sparse = revlog._sparserevlog
687 687 curr = len(revlog)
688 688 prev = curr - 1
689 689 deltachain = lambda rev: revlog._deltachain(rev)[0]
690 690
691 691 if gdelta:
692 692 # exclude already lazy tested base if any
693 693 parents = [p for p in (p1, p2) if p != nullrev]
694 694
695 695 if not revlog._deltabothparents and len(parents) == 2:
696 696 parents.sort()
697 697 # To minimize the chance of having to build a fulltext,
698 698 # pick first whichever parent is closest to us (max rev)
699 699 yield (parents[1],)
700 700 # then the other one (min rev) if the first did not fit
701 701 yield (parents[0],)
702 702 elif len(parents) > 0:
703 703 # Test all parents (1 or 2), and keep the best candidate
704 704 yield parents
705 705
706 706 if sparse and parents:
707 707 snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
708 708 # See if we can use an existing snapshot in the parent chains to use as
709 709 # a base for a new intermediate-snapshot
710 710 #
711 711 # search for snapshot in parents delta chain
712 712 # map: snapshot-level: snapshot-rev
713 713 parents_snaps = collections.defaultdict(set)
714 714 candidate_chains = [deltachain(p) for p in parents]
715 715 for chain in candidate_chains:
716 716 for idx, s in enumerate(chain):
717 717 if not revlog.issnapshot(s):
718 718 break
719 719 parents_snaps[idx].add(s)
720 720 snapfloor = min(parents_snaps[0]) + 1
721 721 _findsnapshots(revlog, snapshots, snapfloor)
722 # search for the highest "unrelated" revision
723 #
724 # Adding snapshots used by "unrelated" revision increase the odd we
725 # reuse an independant, yet better snapshot chain.
726 #
727 # XXX instead of building a set of revisions, we could lazily enumerate
728 # over the chains. That would be more efficient, however we stick to
729 # simple code for now.
730 all_revs = set()
731 for chain in candidate_chains:
732 all_revs.update(chain)
733 other = None
734 for r in revlog.revs(prev, snapfloor):
735 if r not in all_revs:
736 other = r
737 break
738 if other is not None:
739 # To avoid unfair competition, we won't use unrelated intermediate
740 # snapshot that are deeper than the ones from the parent delta
741 # chain.
742 max_depth = max(parents_snaps.keys())
743 chain = deltachain(other)
744 for idx, s in enumerate(chain):
745 if s < snapfloor:
746 continue
747 if max_depth < idx:
748 break
749 if not revlog.issnapshot(s):
750 break
751 parents_snaps[idx].add(s)
722 752 # Test them as possible intermediate snapshot base
723 753 # We test them from highest to lowest level. High level one are more
724 754 # likely to result in small delta
725 755 floor = None
726 756 for idx, snaps in sorted(parents_snaps.items(), reverse=True):
727 757 siblings = set()
728 758 for s in snaps:
729 759 siblings.update(snapshots[s])
730 760 # Before considering making a new intermediate snapshot, we check
731 761 # if an existing snapshot, children of base we consider, would be
732 762 # suitable.
733 763 #
734 764 # It give a change to reuse a delta chain "unrelated" to the
735 765 # current revision instead of starting our own. Without such
736 766 # re-use, topological branches would keep reopening new chains.
737 767 # Creating more and more snapshot as the repository grow.
738 768
739 769 if floor is not None:
740 770 # We only do this for siblings created after the one in our
741 771 # parent's delta chain. Those created before has less chances
742 772 # to be valid base since our ancestors had to create a new
743 773 # snapshot.
744 774 siblings = [r for r in siblings if floor < r]
745 775 yield tuple(sorted(siblings))
746 776 # then test the base from our parent's delta chain.
747 777 yield tuple(sorted(snaps))
748 778 floor = min(snaps)
749 779 # No suitable base found in the parent chain, search if any full
750 780 # snapshots emitted since parent's base would be a suitable base for an
751 781 # intermediate snapshot.
752 782 #
753 783 # It give a chance to reuse a delta chain unrelated to the current
754 784 # revisions instead of starting our own. Without such re-use,
755 785 # topological branches would keep reopening new full chains. Creating
756 786 # more and more snapshot as the repository grow.
757 787 yield tuple(snapshots[nullrev])
758 788
759 # other approach failed try against prev to hopefully save us a
760 # fulltext.
761 yield (prev,)
789 if not sparse:
790 # other approach failed try against prev to hopefully save us a
791 # fulltext.
792 yield (prev,)
762 793
763 794 class deltacomputer(object):
764 795 def __init__(self, revlog):
765 796 self.revlog = revlog
766 797
767 798 def buildtext(self, revinfo, fh):
768 799 """Builds a fulltext version of a revision
769 800
770 801 revinfo: _revisioninfo instance that contains all needed info
771 802 fh: file handle to either the .i or the .d revlog file,
772 803 depending on whether it is inlined or not
773 804 """
774 805 btext = revinfo.btext
775 806 if btext[0] is not None:
776 807 return btext[0]
777 808
778 809 revlog = self.revlog
779 810 cachedelta = revinfo.cachedelta
780 811 baserev = cachedelta[0]
781 812 delta = cachedelta[1]
782 813
783 814 fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
784 815 revinfo.p1, revinfo.p2,
785 816 revinfo.flags, revinfo.node)
786 817 return fulltext
787 818
788 819 def _builddeltadiff(self, base, revinfo, fh):
789 820 revlog = self.revlog
790 821 t = self.buildtext(revinfo, fh)
791 822 if revlog.iscensored(base):
792 823 # deltas based on a censored revision must replace the
793 824 # full content in one patch, so delta works everywhere
794 825 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
795 826 delta = header + t
796 827 else:
797 828 ptext = revlog.revision(base, _df=fh, raw=True)
798 829 delta = mdiff.textdiff(ptext, t)
799 830
800 831 return delta
801 832
802 833 def _builddeltainfo(self, revinfo, base, fh):
803 834 # can we use the cached delta?
804 835 if revinfo.cachedelta and revinfo.cachedelta[0] == base:
805 836 delta = revinfo.cachedelta[1]
806 837 else:
807 838 delta = self._builddeltadiff(base, revinfo, fh)
808 839 revlog = self.revlog
809 840 header, data = revlog.compress(delta)
810 841 deltalen = len(header) + len(data)
811 842 chainbase = revlog.chainbase(base)
812 843 offset = revlog.end(len(revlog) - 1)
813 844 dist = deltalen + offset - revlog.start(chainbase)
814 845 if revlog._generaldelta:
815 846 deltabase = base
816 847 else:
817 848 deltabase = chainbase
818 849 chainlen, compresseddeltalen = revlog._chaininfo(base)
819 850 chainlen += 1
820 851 compresseddeltalen += deltalen
821 852
822 853 revlog = self.revlog
823 854 snapshotdepth = None
824 855 if deltabase == nullrev:
825 856 snapshotdepth = 0
826 857 elif revlog._sparserevlog and revlog.issnapshot(deltabase):
827 858 # A delta chain should always be one full snapshot,
828 859 # zero or more semi-snapshots, and zero or more deltas
829 860 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
830 861 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
831 862 snapshotdepth = len(revlog._deltachain(deltabase)[0])
832 863
833 864 return _deltainfo(dist, deltalen, (header, data), deltabase,
834 865 chainbase, chainlen, compresseddeltalen,
835 866 snapshotdepth)
836 867
837 868 def _fullsnapshotinfo(self, fh, revinfo):
838 869 curr = len(self.revlog)
839 870 rawtext = self.buildtext(revinfo, fh)
840 871 data = self.revlog.compress(rawtext)
841 872 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
842 873 deltabase = chainbase = curr
843 874 snapshotdepth = 0
844 875 chainlen = 1
845 876
846 877 return _deltainfo(dist, deltalen, data, deltabase,
847 878 chainbase, chainlen, compresseddeltalen,
848 879 snapshotdepth)
849 880
850 881 def finddeltainfo(self, revinfo, fh):
851 882 """Find an acceptable delta against a candidate revision
852 883
853 884 revinfo: information about the revision (instance of _revisioninfo)
854 885 fh: file handle to either the .i or the .d revlog file,
855 886 depending on whether it is inlined or not
856 887
857 888 Returns the first acceptable candidate revision, as ordered by
858 889 _candidategroups
859 890
860 891 If no suitable deltabase is found, we return delta info for a full
861 892 snapshot.
862 893 """
863 894 if not revinfo.textlen:
864 895 return self._fullsnapshotinfo(fh, revinfo)
865 896
866 897 # no delta for flag processor revision (see "candelta" for why)
867 898 # not calling candelta since only one revision needs test, also to
868 899 # avoid overhead fetching flags again.
869 900 if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
870 901 return self._fullsnapshotinfo(fh, revinfo)
871 902
872 903 cachedelta = revinfo.cachedelta
873 904 p1 = revinfo.p1
874 905 p2 = revinfo.p2
875 906 revlog = self.revlog
876 907
877 908 deltainfo = None
878 909 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
879 910 groups = _candidategroups(self.revlog, revinfo.textlen,
880 911 p1r, p2r, cachedelta)
881 912 candidaterevs = next(groups)
882 913 while candidaterevs is not None:
883 914 nominateddeltas = []
884 915 if deltainfo is not None:
885 916 # if we already found a good delta,
886 917 # challenge it against refined candidates
887 918 nominateddeltas.append(deltainfo)
888 919 for candidaterev in candidaterevs:
889 920 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
890 921 if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
891 922 nominateddeltas.append(candidatedelta)
892 923 if nominateddeltas:
893 924 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
894 925 if deltainfo is not None:
895 926 candidaterevs = groups.send(deltainfo.base)
896 927 else:
897 928 candidaterevs = next(groups)
898 929
899 930 if deltainfo is None:
900 931 deltainfo = self._fullsnapshotinfo(fh, revinfo)
901 932 return deltainfo
@@ -1,133 +1,133 b''
1 1 ====================================
2 2 Test delta choice with sparse revlog
3 3 ====================================
4 4
5 5 Sparse-revlog usually shows the most gain on Manifest. However, it is simpler
6 6 to general an appropriate file, so we test with a single file instead. The
7 7 goal is to observe intermediate snapshot being created.
8 8
9 9 We need a large enough file. Part of the content needs to be replaced
10 10 repeatedly while some of it changes rarely.
11 11
12 12 $ bundlepath="$TESTDIR/artifacts/cache/big-file-churn.hg"
13 13
14 14 $ expectedhash=`cat "$bundlepath".md5`
15 15 $ if [ ! -f "$bundlepath" ]; then
16 16 > echo 'skipped: missing artifact, run "'"$TESTDIR"'/artifacts/scripts/generate-churning-bundle.py"'
17 17 > exit 80
18 18 > fi
19 19 $ currenthash=`f -M "$bundlepath" | cut -d = -f 2`
20 20 $ if [ "$currenthash" != "$expectedhash" ]; then
21 21 > echo 'skipped: outdated artifact, md5 "'"$currenthash"'" expected "'"$expectedhash"'" run "'"$TESTDIR"'/artifacts/scripts/generate-churning-bundle.py"'
22 22 > exit 80
23 23 > fi
24 24
25 25 $ cat >> $HGRCPATH << EOF
26 26 > [format]
27 27 > sparse-revlog = yes
28 28 > [storage]
29 29 > revlog.optimize-delta-parent-choice = yes
30 30 > EOF
31 31 $ hg init sparse-repo
32 32 $ cd sparse-repo
33 33 $ hg unbundle $bundlepath
34 34 adding changesets
35 35 adding manifests
36 36 adding file changes
37 37 added 5001 changesets with 5001 changes to 1 files (+89 heads)
38 38 new changesets 9706f5af64f4:d9032adc8114 (5001 drafts)
39 39 (run 'hg heads' to see heads, 'hg merge' to merge)
40 40 $ hg up
41 41 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
42 42 updated to "d9032adc8114: commit #5000"
43 43 89 other heads for branch "default"
44 44
45 45 $ hg log --stat -r 0:3
46 46 changeset: 0:9706f5af64f4
47 47 user: test
48 48 date: Thu Jan 01 00:00:00 1970 +0000
49 49 summary: initial commit
50 50
51 51 SPARSE-REVLOG-TEST-FILE | 10500 ++++++++++++++++++++++++++++++++++++++++++++++
52 52 1 files changed, 10500 insertions(+), 0 deletions(-)
53 53
54 54 changeset: 1:724907deaa5e
55 55 user: test
56 56 date: Thu Jan 01 00:00:00 1970 +0000
57 57 summary: commit #1
58 58
59 59 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
60 60 1 files changed, 534 insertions(+), 534 deletions(-)
61 61
62 62 changeset: 2:62c41bce3e5d
63 63 user: test
64 64 date: Thu Jan 01 00:00:00 1970 +0000
65 65 summary: commit #2
66 66
67 67 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
68 68 1 files changed, 534 insertions(+), 534 deletions(-)
69 69
70 70 changeset: 3:348a9cbd6959
71 71 user: test
72 72 date: Thu Jan 01 00:00:00 1970 +0000
73 73 summary: commit #3
74 74
75 75 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
76 76 1 files changed, 534 insertions(+), 534 deletions(-)
77 77
78 78
79 79 $ f -s .hg/store/data/*.d
80 .hg/store/data/_s_p_a_r_s_e-_r_e_v_l_o_g-_t_e_s_t-_f_i_l_e.d: size=59302280
80 .hg/store/data/_s_p_a_r_s_e-_r_e_v_l_o_g-_t_e_s_t-_f_i_l_e.d: size=59230936
81 81 $ hg debugrevlog *
82 82 format : 1
83 83 flags : generaldelta
84 84
85 85 revisions : 5001
86 86 merges : 625 (12.50%)
87 87 normal : 4376 (87.50%)
88 88 revisions : 5001
89 89 empty : 0 ( 0.00%)
90 90 text : 0 (100.00%)
91 91 delta : 0 (100.00%)
92 snapshot : 168 ( 3.36%)
93 lvl-0 : 4 ( 0.08%)
94 lvl-1 : 18 ( 0.36%)
95 lvl-2 : 39 ( 0.78%)
96 lvl-3 : 54 ( 1.08%)
97 lvl-4 : 53 ( 1.06%)
98 deltas : 4833 (96.64%)
99 revision size : 59302280
100 snapshot : 5833942 ( 9.84%)
101 lvl-0 : 804068 ( 1.36%)
102 lvl-1 : 1378470 ( 2.32%)
103 lvl-2 : 1608138 ( 2.71%)
104 lvl-3 : 1222158 ( 2.06%)
105 lvl-4 : 821108 ( 1.38%)
106 deltas : 53468338 (90.16%)
92 snapshot : 176 ( 3.52%)
93 lvl-0 : 3 ( 0.06%)
94 lvl-1 : 17 ( 0.34%)
95 lvl-2 : 45 ( 0.90%)
96 lvl-3 : 56 ( 1.12%)
97 lvl-4 : 55 ( 1.10%)
98 deltas : 4825 (96.48%)
99 revision size : 59230936
100 snapshot : 5770371 ( 9.74%)
101 lvl-0 : 602962 ( 1.02%)
102 lvl-1 : 1534153 ( 2.59%)
103 lvl-2 : 1604445 ( 2.71%)
104 lvl-3 : 1218174 ( 2.06%)
105 lvl-4 : 810637 ( 1.37%)
106 deltas : 53460565 (90.26%)
107 107
108 108 chunks : 5001
109 109 0x78 (x) : 5001 (100.00%)
110 chunks size : 59302280
111 0x78 (x) : 59302280 (100.00%)
110 chunks size : 59230936
111 0x78 (x) : 59230936 (100.00%)
112 112
113 113 avg chain length : 17
114 114 max chain length : 45
115 max chain reach : 22744720
115 max chain reach : 25326012
116 116 compression ratio : 29
117 117
118 118 uncompressed data size (min/max/avg) : 346468 / 346472 / 346471
119 full revision size (min/max/avg) : 200985 / 201050 / 201017
120 inter-snapshot size (min/max/avg) : 11598 / 163304 / 30669
121 level-1 (min/max/avg) : 15616 / 163304 / 76581
122 level-2 (min/max/avg) : 11602 / 86428 / 41234
123 level-3 (min/max/avg) : 11598 / 42390 / 22632
124 level-4 (min/max/avg) : 11603 / 19649 / 15492
125 delta size (min/max/avg) : 10649 / 105465 / 11063
119 full revision size (min/max/avg) : 200897 / 201050 / 200987
120 inter-snapshot size (min/max/avg) : 11598 / 171990 / 29869
121 level-1 (min/max/avg) : 14037 / 171990 / 90244
122 level-2 (min/max/avg) : 11632 / 84456 / 35654
123 level-3 (min/max/avg) : 11598 / 41486 / 21753
124 level-4 (min/max/avg) : 11618 / 19913 / 14738
125 delta size (min/max/avg) : 10649 / 105209 / 11079
126 126
127 deltas against prev : 4167 (86.22%)
128 where prev = p1 : 4129 (99.09%)
127 deltas against prev : 4156 (86.13%)
128 where prev = p1 : 4120 (99.13%)
129 129 where prev = p2 : 0 ( 0.00%)
130 other : 38 ( 0.91%)
131 deltas against p1 : 643 (13.30%)
130 other : 36 ( 0.87%)
131 deltas against p1 : 646 (13.39%)
132 132 deltas against p2 : 23 ( 0.48%)
133 133 deltas against other : 0 ( 0.00%)
General Comments 0
You need to be logged in to leave comments. Login now