##// END OF EJS Templates
snapshot: search for unrelated but reusable full-snapshot...
Boris Feld -
r39529:3ca144f1 default
parent child Browse files
Show More
@@ -1,794 +1,817 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 import collections
12 13 import heapq
13 14 import struct
14 15
15 16 # import stuff from node for others to import from revlog
16 17 from ..node import (
17 18 nullrev,
18 19 )
19 20 from ..i18n import _
20 21
21 22 from .constants import (
22 23 REVIDX_ISCENSORED,
23 24 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 25 )
25 26
26 27 from ..thirdparty import (
27 28 attr,
28 29 )
29 30
30 31 from .. import (
31 32 error,
32 33 mdiff,
33 34 )
34 35
35 36 RevlogError = error.RevlogError
36 37 CensoredNodeError = error.CensoredNodeError
37 38
38 39 # maximum <delta-chain-data>/<revision-text-length> ratio
39 40 LIMIT_DELTA2TEXT = 2
40 41
41 42 class _testrevlog(object):
42 43 """minimalist fake revlog to use in doctests"""
43 44
44 45 def __init__(self, data, density=0.5, mingap=0):
45 46 """data is an list of revision payload boundaries"""
46 47 self._data = data
47 48 self._srdensitythreshold = density
48 49 self._srmingapsize = mingap
49 50
50 51 def start(self, rev):
51 52 if rev == 0:
52 53 return 0
53 54 return self._data[rev - 1]
54 55
55 56 def end(self, rev):
56 57 return self._data[rev]
57 58
58 59 def length(self, rev):
59 60 return self.end(rev) - self.start(rev)
60 61
61 62 def __len__(self):
62 63 return len(self._data)
63 64
def slicechunk(revlog, revs, deltainfo=None, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    # * deltainfo is used to pick are good delta chain when writing.
    if deltainfo is not None and targetsize is not None:
        msg = 'cannot use `targetsize` with a `deltainfo`'
        raise error.ProgrammingError(msg)
    densitygroups = _slicechunktodensity(revlog, revs,
                                         deltainfo,
                                         revlog._srdensitythreshold,
                                         revlog._srmingapsize)
    for chunk in densitygroups:
        # further split any density-selected chunk that exceeds targetsize
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
131 132
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunks that density slicing selected but
    that are still too large compared to the read guarantee of revlogs. This
    might happen when the "minimal gap size" interrupted the slicing or when
    chains are built in a way that creates large blocks next to each other.

    >>> revlog = _testrevlog([
    ...  3,  #0 (3)
    ...  5,  #1 (2)
    ...  6,  #2 (1)
    ...  8,  #3 (2)
    ...  8,  #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ])

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]
    """
    assert targetsize is None or 0 <= targetsize
    # whole chunk already fits: nothing to do
    if targetsize is None or segmentspan(revlog, revs) <= targetsize:
        yield revs
        return

    # Grow a chunk greedily; flush it whenever including the next revision
    # would make the on-disk span exceed the target read size.  A chunk
    # always keeps at least its opening revision, even if oversized.
    chunkstartidx = 0
    chunkstartdata = revlog.start(revs[0])
    lastgoodidx = 0
    for curidx in range(1, len(revs)):  # the first rev opens the chunk
        rev = revs[curidx]
        if revlog.end(rev) - chunkstartdata <= targetsize:
            lastgoodidx = curidx
        else:
            emitted = _trimchunk(revlog, revs, chunkstartidx, lastgoodidx + 1)
            if emitted:
                yield emitted
            chunkstartidx = curidx
            chunkstartdata = revlog.start(rev)
            lastgoodidx = curidx
    yield _trimchunk(revlog, revs, chunkstartidx)
208 209
def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    ``deltainfo`` is a _deltainfo instance of a revision that we would append
    to the top of the revlog.  When set, the not-yet-stored revision is
    treated as an extra entry at index ``len(revlog)``.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision can never be split further
    if len(revs) <= 1:
        yield revs
        return

    # ``nextrev`` is the index the pending (not yet stored) revision would
    # get; ``nextoffset`` is where its data would start on disk
    nextrev = len(revlog)
    nextoffset = revlog.end(nextrev - 1)

    if deltainfo is None:
        deltachainspan = segmentspan(revlog, revs)
        chainpayload = sum(length(r) for r in revs)
    else:
        deltachainspan = deltainfo.distance
        chainpayload = deltainfo.compresseddeltalen

    # spans smaller than the minimum gap size are never worth splitting
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    # already dense enough: read everything in one go
    if density >= targetdensity:
        yield revs
        return

    if deltainfo is not None and deltainfo.deltalen:
        # account for the pending revision as a virtual entry at the end
        revs = list(revs)
        revs.append(nextrev)

    # Store the gaps in a heap to have them sorted by decreasing size
    gapsheap = []
    heapq.heapify(gapsheap)
    prevend = None
    for i, rev in enumerate(revs):
        if rev < nextrev:
            revstart = start(rev)
            revlen = length(rev)
        else:
            # the virtual entry for the pending revision
            revstart = nextoffset
            revlen = deltainfo.deltalen

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                # negate so heapq's min-heap pops the largest gap first
                heapq.heappush(gapsheap, (-gapsize, i))

        prevend = revstart + revlen

    # Collect the indices of the largest holes until the density is acceptable
    indicesheap = []
    heapq.heapify(indicesheap)
    while gapsheap and density < targetdensity:
        oppgapsize, gapidx = heapq.heappop(gapsheap)

        heapq.heappush(indicesheap, gapidx)

        # the gap sizes are stored as negatives to be sorted decreasingly
        # by the heap
        readdata -= (-oppgapsize)
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0

    # Cut the revs at collected indices (popped in increasing order)
    previdx = 0
    while indicesheap:
        idx = heapq.heappop(indicesheap)

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    # emit whatever remains after the last cut
    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
352 353
353 354 def _trimchunk(revlog, revs, startidx, endidx=None):
354 355 """returns revs[startidx:endidx] without empty trailing revs
355 356
356 357 Doctest Setup
357 358 >>> revlog = _testrevlog([
358 359 ... 5, #0
359 360 ... 10, #1
360 361 ... 12, #2
361 362 ... 12, #3 (empty)
362 363 ... 17, #4
363 364 ... 21, #5
364 365 ... 21, #6 (empty)
365 366 ... ])
366 367
367 368 Contiguous cases:
368 369 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
369 370 [0, 1, 2, 3, 4, 5]
370 371 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
371 372 [0, 1, 2, 3, 4]
372 373 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
373 374 [0, 1, 2]
374 375 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
375 376 [2]
376 377 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
377 378 [3, 4, 5]
378 379 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
379 380 [3, 4]
380 381
381 382 Discontiguous cases:
382 383 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
383 384 [1, 3, 5]
384 385 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
385 386 [1]
386 387 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
387 388 [3, 5]
388 389 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
389 390 [3, 5]
390 391 """
391 392 length = revlog.length
392 393
393 394 if endidx is None:
394 395 endidx = len(revs)
395 396
396 397 # If we have a non-emtpy delta candidate, there are nothing to trim
397 398 if revs[endidx - 1] < len(revlog):
398 399 # Trim empty revs at the end, except the very first revision of a chain
399 400 while (endidx > 1
400 401 and endidx > startidx
401 402 and length(revs[endidx - 1]) == 0):
402 403 endidx -= 1
403 404
404 405 return revs[startidx:endidx]
405 406
def segmentspan(revlog, revs, deltainfo=None):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5,  #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    if not revs:
        return 0
    lastrev = revs[-1]
    if deltainfo is not None and len(revlog) <= lastrev:
        # the segment ends on a revision not stored yet: its size comes
        # from the pending deltainfo rather than from the revlog index
        if len(revs) == 1:
            return deltainfo.deltalen
        end = revlog.end(len(revlog) - 1) + deltainfo.deltalen
    else:
        end = revlog.end(lastrev)
    return end - revlog.start(revs[0])
440 441
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata

    fh:           open file handle to the revlog data file
    revlog:       the revlog the (base, delta) pair comes from
    baserev:      revision number the delta applies to
    delta:        the binary delta
    p1, p2:       parent nodes, used for hash validation
    flags:        index flags of the revision being reconstructed
    expectednode: node the resulting fulltext must hash to

    Raises RevlogError when a revision flagged as censored turns out not
    to be; re-raises CensoredNodeError unless the censored flag is set.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            # the flag claims censorship but checkhash succeeded above
            raise RevlogError(_('node %s is not censored') % expectednode)
    except CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
468 469
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    # Immutable record describing one candidate delta for a revision.
    distance = attr.ib()            # distance from the base revision (bytes of I/O)
    deltalen = attr.ib()            # length of the compressed delta
    data = attr.ib()                # (header, compressed-data) tuple
    base = attr.ib()                # revision number of the delta base
    chainbase = attr.ib()           # first revision of the delta chain
    chainlen = attr.ib()            # length of the resulting delta chain
    compresseddeltalen = attr.ib()  # cumulated compressed size of the whole chain
    snapshotdepth = attr.ib()       # snapshot level, or None if a plain delta
479 480
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Returns True if the given delta is good. Good means that it is within
    the disk span, disk size, and chain length bounds that we know to be
    performant.

    revlog:    the target revlog
    deltainfo: a _deltainfo instance (or None, which is never good)
    revinfo:   information about the revision the delta would store
    """
    if deltainfo is None:
        return False

    # - 'deltainfo.distance' is the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    if revlog._sparserevlog:
        # As sparse-read will be used, we can consider that the distance,
        # instead of being the span of the whole chunk,
        # is the span of the largest read chunk
        base = deltainfo.base

        if base != nullrev:
            deltachain = revlog._deltachain(base)[0]
        else:
            deltachain = []

        # search for the first non-snapshot revision
        # NOTE(review): if ``deltachain`` is empty (base == nullrev), ``idx``
        # is never bound and the slice below raises NameError, and
        # ``max(all_span)`` would fail on an empty list. In practice
        # candidates from _candidategroups exclude nullrev -- confirm no
        # other caller can pass a nullrev base.
        for idx, r in enumerate(deltachain):
            if not revlog.issnapshot(r):
                break
        deltachain = deltachain[idx:]
        chunks = slicechunk(revlog, deltachain, deltainfo)
        all_span = [segmentspan(revlog, revs, deltainfo)
                    for revs in chunks]
        distance = max(all_span)
    else:
        distance = deltainfo.distance

    textlen = revinfo.textlen
    defaultmax = textlen * 4
    maxdist = revlog._maxdeltachainspan
    if not maxdist:
        maxdist = distance # ensure the conditional pass
    maxdist = max(maxdist, defaultmax)
    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
        # In multiple places, we are ignoring irrelevant data ranges below a
        # certain size. We also apply this tradeoff here and relax the span
        # constraint for small enough content.
        maxdist = revlog._srmingapsize

    # Bad delta from read span:
    #
    #   If the span of data read is larger than the maximum allowed.
    if maxdist < distance:
        return False

    # Bad delta from new delta size:
    #
    #   If the delta size is larger than the target text, storing the
    #   delta will be inefficient.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    #   If the sum of delta get larger than K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    #   If the number of delta in the chain gets too high.
    if (revlog._maxchainlen
        and revlog._maxchainlen < deltainfo.chainlen):
        return False

    # bad delta from intermediate snapshot size limit
    #
    #   If an intermediate snapshot size is higher than the limit.  The
    #   limit exist to prevent endless chain of intermediate delta to be
    #   created.  (each extra snapshot level halves the allowed size)
    if (deltainfo.snapshotdepth is not None and
        (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
        return False

    # bad delta if new intermediate snapshot is larger than the previous
    # snapshot
    if (deltainfo.snapshotdepth
        and revlog.length(deltainfo.base) < deltainfo.deltalen):
        return False

    return True
570 571
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This top level function focus on emitting groups with unique and worthwhile
    content. See _raw_candidate_groups for details about the group order.

    revlog:     the target revlog
    textlen:    length of the fulltext of the new revision
    p1, p2:     parent revision numbers
    cachedelta: (baserev, delta) received with the revision, if any

    Yields non-empty tuples of revision numbers; each revision appears at
    most once across all yielded groups.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent

    # the cumulated size of the resulting delta chain must stay below this
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = set([nullrev])
    for temptative in _rawgroups(revlog, p1, p2, cachedelta):
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            #
            # The empty revision itself must be marked as tested *before*
            # stepping to its delta parent; marking after the step would
            # flag the walked-to parent as already tested, and the
            # ``rev in tested`` filter below would then discard it
            # unconditionally, losing the candidate.
            while not (rev == nullrev or rev in tested or deltalength(rev)):
                tested.add(rev)
                rev = deltaparent(rev)
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mecanism
            # would help to reduce this impact.
            yield tuple(group)
611 615
def _findsnapshots(revlog, cache, start_rev):
    """Record every full snapshot from ``start_rev`` to tip into ``cache``.

    A revision is a full snapshot when its delta parent is nullrev; the
    matching revision numbers are appended to ``cache[nullrev]``.
    """
    for rev in revlog.revs(start_rev):
        if revlog.deltaparent(rev) == nullrev:
            cache[nullrev].append(rev)
622
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides groups of revisions to be tested as delta bases

    This lower level function focuses on emitting theoretically interesting
    deltas without looking at any practical detail.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1

    # First we try to reuse a the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse of source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and gdelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        yield (cachedelta[0],)

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if revlog._deltabothparents or len(parents) != 2:
            if parents:
                # Test all parents (1 or 2), and keep the best candidate
                yield parents
        else:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)

        if sparse and parents:
            # See if we can use an existing snapshot in the parent chains to
            # use as a base for a new intermediate-snapshot
            bases = [revlog._deltachain(p)[0] for p in parents]
            yield tuple(sorted(bases))
            # No suitable base found in the parent chain, search if any full
            # snapshots emitted since parent's base would be a suitable base
            # for an intermediate snapshot.
            #
            # It gives a chance to reuse a delta chain unrelated to the
            # current revisions instead of starting our own. Without such
            # re-use, topological branches would keep reopening new full
            # chains. Creating more and more snapshots as the repository
            # grows.
            snapfloor = min(bases) + 1
            snapshots = collections.defaultdict(list)
            _findsnapshots(revlog, snapshots, snapfloor)
            yield tuple(snapshots[nullrev])

    # other approach failed try against prev to hopefully save us a
    # fulltext.
    yield (prev,)
663 686
class deltacomputer(object):
    """Compute delta information for revisions to be stored in a revlog."""

    def __init__(self, revlog):
        # the revlog the computed deltas are destined for
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        The result is memoized in ``revinfo.btext[0]`` so subsequent calls
        return the cached fulltext without re-applying the delta.
        """
        btext = revinfo.btext
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return the raw (uncompressed) delta turning ``base`` into this
        revision's fulltext."""
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a _deltainfo describing a (compressed) delta of this
        revision against ``base``."""
        # can we use the cached delta?
        if revinfo.cachedelta and revinfo.cachedelta[0] == base:
            delta = revinfo.cachedelta[1]
        else:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        # distance: bytes between the start of the chain base and the end
        # of the new delta (appended at the current end of the revlog)
        offset = revlog.end(len(revlog) - 1)
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            deltabase = base
        else:
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        snapshotdepth = None
        if deltabase == nullrev:
            # a delta against nothing is a level-0 (full) snapshot
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        """Build a _deltainfo storing the revision as a full snapshot
        (compressed fulltext, based on itself)."""
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        if not revinfo.textlen:
            # empty revisions are always stored as full snapshots
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        for candidaterevs in groups:
            nominateddeltas = []
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev,
                                                      fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # among all acceptable candidates of this group, keep the
                # one producing the smallest delta
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
                break

        if deltainfo is None:
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
@@ -1,124 +1,127 b''
1 1 ====================================
2 2 Test delta choice with sparse revlog
3 3 ====================================
4 4
5 5 Sparse-revlog usually shows the most gain on Manifest. However, it is simpler
6 6 to generate an appropriate file, so we test with a single file instead. The
7 7 goal is to observe intermediate snapshot being created.
8 8
9 9 We need a large enough file. Part of the content needs to be replaced
10 10 repeatedly while some of it changes rarely.
11 11
12 12 $ bundlepath="$TESTDIR/artifacts/cache/big-file-churn.hg"
13 13
14 14 $ expectedhash=`cat "$bundlepath".md5`
15 15 $ if [ ! -f "$bundlepath" ]; then
16 16 > echo 'skipped: missing artifact, run "'"$TESTDIR"'/artifacts/scripts/generate-churning-bundle.py"'
17 17 > exit 80
18 18 > fi
19 19 $ currenthash=`f -M "$bundlepath" | cut -d = -f 2`
20 20 $ if [ "$currenthash" != "$expectedhash" ]; then
21 21 > echo 'skipped: outdated artifact, md5 "'"$currenthash"'" expected "'"$expectedhash"'" run "'"$TESTDIR"'/artifacts/scripts/generate-churning-bundle.py"'
22 22 > exit 80
23 23 > fi
24 24
25 25 $ cat >> $HGRCPATH << EOF
26 26 > [format]
27 27 > sparse-revlog = yes
28 28 > [storage]
29 29 > revlog.optimize-delta-parent-choice = yes
30 30 > EOF
31 31 $ hg init sparse-repo
32 32 $ cd sparse-repo
33 33 $ hg unbundle $bundlepath
34 34 adding changesets
35 35 adding manifests
36 36 adding file changes
37 37 added 5001 changesets with 5001 changes to 1 files (+89 heads)
38 38 new changesets 9706f5af64f4:d9032adc8114 (5001 drafts)
39 39 (run 'hg heads' to see heads, 'hg merge' to merge)
40 40 $ hg up
41 41 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
42 42 updated to "d9032adc8114: commit #5000"
43 43 89 other heads for branch "default"
44 44
45 45 $ hg log --stat -r 0:3
46 46 changeset: 0:9706f5af64f4
47 47 user: test
48 48 date: Thu Jan 01 00:00:00 1970 +0000
49 49 summary: initial commit
50 50
51 51 SPARSE-REVLOG-TEST-FILE | 10500 ++++++++++++++++++++++++++++++++++++++++++++++
52 52 1 files changed, 10500 insertions(+), 0 deletions(-)
53 53
54 54 changeset: 1:724907deaa5e
55 55 user: test
56 56 date: Thu Jan 01 00:00:00 1970 +0000
57 57 summary: commit #1
58 58
59 59 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
60 60 1 files changed, 534 insertions(+), 534 deletions(-)
61 61
62 62 changeset: 2:62c41bce3e5d
63 63 user: test
64 64 date: Thu Jan 01 00:00:00 1970 +0000
65 65 summary: commit #2
66 66
67 67 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
68 68 1 files changed, 534 insertions(+), 534 deletions(-)
69 69
70 70 changeset: 3:348a9cbd6959
71 71 user: test
72 72 date: Thu Jan 01 00:00:00 1970 +0000
73 73 summary: commit #3
74 74
75 75 SPARSE-REVLOG-TEST-FILE | 1068 +++++++++++++++++++++++-----------------------
76 76 1 files changed, 534 insertions(+), 534 deletions(-)
77 77
78 78
79 79 $ f -s .hg/store/data/*.d
80 .hg/store/data/_s_p_a_r_s_e-_r_e_v_l_o_g-_t_e_s_t-_f_i_l_e.d: size=72315280
80 .hg/store/data/_s_p_a_r_s_e-_r_e_v_l_o_g-_t_e_s_t-_f_i_l_e.d: size=67810463
81 81 $ hg debugrevlog *
82 82 format : 1
83 83 flags : generaldelta
84 84
85 85 revisions : 5001
86 86 merges : 625 (12.50%)
87 87 normal : 4376 (87.50%)
88 88 revisions : 5001
89 89 empty : 0 ( 0.00%)
90 90 text : 0 (100.00%)
91 91 delta : 0 (100.00%)
92 snapshot : 145 ( 2.90%)
93 lvl-0 : 15 ( 0.30%)
94 lvl-1 : 130 ( 2.60%)
95 deltas : 4856 (97.10%)
96 revision size : 72315280
97 snapshot : 18481085 (25.56%)
98 lvl-0 : 3016019 ( 4.17%)
99 lvl-1 : 15465066 (21.39%)
100 deltas : 53834195 (74.44%)
92 snapshot : 126 ( 2.52%)
93 lvl-0 : 4 ( 0.08%)
94 lvl-1 : 120 ( 2.40%)
95 lvl-2 : 2 ( 0.04%)
96 deltas : 4875 (97.48%)
97 revision size : 67810463
98 snapshot : 14373347 (21.20%)
99 lvl-0 : 804235 ( 1.19%)
100 lvl-1 : 13535903 (19.96%)
101 lvl-2 : 33209 ( 0.05%)
102 deltas : 53437116 (78.80%)
101 103
102 104 chunks : 5001
103 105 0x78 (x) : 5001 (100.00%)
104 chunks size : 72315280
105 0x78 (x) : 72315280 (100.00%)
106 chunks size : 67810463
107 0x78 (x) : 67810463 (100.00%)
106 108
107 109 avg chain length : 18
108 110 max chain length : 45
109 max chain reach : 32095083
110 compression ratio : 23
111 max chain reach : 25808240
112 compression ratio : 25
111 113
112 114 uncompressed data size (min/max/avg) : 346468 / 346472 / 346471
113 full revision size (min/max/avg) : 200990 / 201151 / 201067
114 inter-snapshot size (min/max/avg) : 37202 / 173034 / 118962
115 level-1 (min/max/avg) : 37202 / 173034 / 118962
116 delta size (min/max/avg) : 10649 / 104791 / 11086
115 full revision size (min/max/avg) : 201014 / 201116 / 201058
116 inter-snapshot size (min/max/avg) : 11623 / 173150 / 111222
117 level-1 (min/max/avg) : 11623 / 173150 / 112799
118 level-2 (min/max/avg) : 14151 / 19058 / 16604
119 delta size (min/max/avg) : 10649 / 101790 / 10961
117 120
118 deltas against prev : 4185 (86.18%)
119 where prev = p1 : 4139 (98.90%)
121 deltas against prev : 4207 (86.30%)
122 where prev = p1 : 4164 (98.98%)
120 123 where prev = p2 : 0 ( 0.00%)
121 other : 46 ( 1.10%)
122 deltas against p1 : 647 (13.32%)
123 deltas against p2 : 24 ( 0.49%)
124 other : 43 ( 1.02%)
125 deltas against p1 : 653 (13.39%)
126 deltas against p2 : 15 ( 0.31%)
124 127 deltas against other : 0 ( 0.00%)
General Comments 0
You need to be logged in to leave comments. Login now