##// END OF EJS Templates
sparse-revlog: only refine delta candidates in the sparse case (issue6006)...
Boris Feld -
r40479:bafa1c4b stable
parent child Browse files
Show More
@@ -1,939 +1,944 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import heapq
14 14 import struct
15 15
16 16 # import stuff from node for others to import from revlog
17 17 from ..node import (
18 18 nullrev,
19 19 )
20 20 from ..i18n import _
21 21
22 22 from .constants import (
23 23 REVIDX_ISCENSORED,
24 24 REVIDX_RAWTEXT_CHANGING_FLAGS,
25 25 )
26 26
27 27 from ..thirdparty import (
28 28 attr,
29 29 )
30 30
31 31 from .. import (
32 32 error,
33 33 mdiff,
34 34 )
35 35
36 36 # maximum <delta-chain-data>/<revision-text-length> ratio
37 37 LIMIT_DELTA2TEXT = 2
38 38
39 39 class _testrevlog(object):
40 40 """minimalist fake revlog to use in doctests"""
41 41
42 42 def __init__(self, data, density=0.5, mingap=0):
43 43 """data is an list of revision payload boundaries"""
44 44 self._data = data
45 45 self._srdensitythreshold = density
46 46 self._srmingapsize = mingap
47 47
48 48 def start(self, rev):
49 49 if rev == 0:
50 50 return 0
51 51 return self._data[rev - 1]
52 52
53 53 def end(self, rev):
54 54 return self._data[rev]
55 55
56 56 def length(self, rev):
57 57 return self.end(rev) - self.start(rev)
58 58
59 59 def __len__(self):
60 60 return len(self._data)
61 61
def slicechunk(revlog, revs, deltainfo=None, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
    For consistency with other slicing choices, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revision chunks are larger than this limit, they will still
    be raised individually.

    >>> revlog = _testrevlog([
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    # * deltainfo is used to pick a good delta chain when writing.
    if not (deltainfo is None or targetsize is None):
        msg = 'cannot use `targetsize` with a `deltainfo`'
        raise error.ProgrammingError(msg)
    # slice on density first, then cap each density-selected chunk to the
    # maximum read size
    for chunk in _slicechunktodensity(revlog, revs,
                                      deltainfo,
                                      revlog._srdensitythreshold,
                                      revlog._srmingapsize):
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
129 129
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunks that density slicing selected, but
    that are still too large compared to the read guarantee of revlogs. This
    might happen when the "minimal gap size" interrupted the slicing or when
    chains are built in a way that creates large blocks next to each other.

    >>> revlog = _testrevlog([
    ...  3, #0 (3)
    ...  5, #1 (2)
    ...  6, #2 (1)
    ...  8, #3 (2)
    ...  8, #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ])

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]
    """
    assert targetsize is None or 0 <= targetsize
    # chunk already fits within the target: no slicing needed
    if targetsize is None or segmentspan(revlog, revs) <= targetsize:
        yield revs
        return

    startrevidx = 0
    startdata = revlog.start(revs[0])
    endrevidx = 0
    iterrevs = enumerate(revs)
    next(iterrevs) # skip first rev.
    for idx, r in iterrevs:
        # span covered by the current chunk if we include rev `r`
        span = revlog.end(r) - startdata
        if span <= targetsize:
            endrevidx = idx
        else:
            # `r` would overflow the chunk: emit what we have and start a new
            # chunk at `r` (a single oversized revision is still emitted alone)
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx
    yield _trimchunk(revlog, revs, startrevidx)
206 206
def _slicechunktodensity(revlog, revs, deltainfo=None, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    ``deltainfo`` is a _deltainfo instance of a revision that we would append
    to the top of the revlog.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision can never be sliced further
    if len(revs) <= 1:
        yield revs
        return

    # `nextrev`/`nextoffset` describe where the candidate revision described
    # by `deltainfo` would land if it were appended to the revlog
    nextrev = len(revlog)
    nextoffset = revlog.end(nextrev - 1)

    if deltainfo is None:
        deltachainspan = segmentspan(revlog, revs)
        chainpayload = sum(length(r) for r in revs)
    else:
        deltachainspan = deltainfo.distance
        chainpayload = deltainfo.compresseddeltalen

    # the whole span is below the minimal gap size: read it in one go
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    # dense enough already: no slicing needed
    if density >= targetdensity:
        yield revs
        return

    # include the (virtual) candidate revision at the end of the rev list so
    # its gap is accounted for
    if deltainfo is not None and deltainfo.deltalen:
        revs = list(revs)
        revs.append(nextrev)

    # Store the gaps in a heap to have them sorted by decreasing size
    gapsheap = []
    heapq.heapify(gapsheap)
    prevend = None
    for i, rev in enumerate(revs):
        if rev < nextrev:
            revstart = start(rev)
            revlen = length(rev)
        else:
            # virtual candidate revision appended past the current tip
            revstart = nextoffset
            revlen = deltainfo.deltalen

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                heapq.heappush(gapsheap, (-gapsize, i))

        prevend = revstart + revlen

    # Collect the indices of the largest holes until the density is acceptable
    indicesheap = []
    heapq.heapify(indicesheap)
    while gapsheap and density < targetdensity:
        oppgapsize, gapidx = heapq.heappop(gapsheap)

        heapq.heappush(indicesheap, gapidx)

        # the gap sizes are stored as negatives to be sorted decreasingly
        # by the heap
        readdata -= (-oppgapsize)
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0

    # Cut the revs at collected indices
    previdx = 0
    while indicesheap:
        idx = heapq.heappop(indicesheap)

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
350 350
351 351 def _trimchunk(revlog, revs, startidx, endidx=None):
352 352 """returns revs[startidx:endidx] without empty trailing revs
353 353
354 354 Doctest Setup
355 355 >>> revlog = _testrevlog([
356 356 ... 5, #0
357 357 ... 10, #1
358 358 ... 12, #2
359 359 ... 12, #3 (empty)
360 360 ... 17, #4
361 361 ... 21, #5
362 362 ... 21, #6 (empty)
363 363 ... ])
364 364
365 365 Contiguous cases:
366 366 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
367 367 [0, 1, 2, 3, 4, 5]
368 368 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
369 369 [0, 1, 2, 3, 4]
370 370 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
371 371 [0, 1, 2]
372 372 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
373 373 [2]
374 374 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
375 375 [3, 4, 5]
376 376 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
377 377 [3, 4]
378 378
379 379 Discontiguous cases:
380 380 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
381 381 [1, 3, 5]
382 382 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
383 383 [1]
384 384 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
385 385 [3, 5]
386 386 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
387 387 [3, 5]
388 388 """
389 389 length = revlog.length
390 390
391 391 if endidx is None:
392 392 endidx = len(revs)
393 393
394 394 # If we have a non-emtpy delta candidate, there are nothing to trim
395 395 if revs[endidx - 1] < len(revlog):
396 396 # Trim empty revs at the end, except the very first revision of a chain
397 397 while (endidx > 1
398 398 and endidx > startidx
399 399 and length(revs[endidx - 1]) == 0):
400 400 endidx -= 1
401 401
402 402 return revs[startidx:endidx]
403 403
def segmentspan(revlog, revs, deltainfo=None):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5, #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    if not revs:
        return 0
    last = revs[-1]
    if deltainfo is None or last < len(revlog):
        # every revision is already stored: span runs to the end of the last
        end = revlog.end(last)
    elif len(revs) == 1:
        # a lone not-yet-stored candidate spans exactly its own delta
        return deltainfo.deltalen
    else:
        # the candidate delta would be appended right after the current tip
        end = revlog.end(len(revlog) - 1) + deltainfo.deltalen
    return end - revlog.start(revs[0])
438 438
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata

    The resulting text is validated against ``expectednode`` (unless flag
    processing says otherwise) and returned.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            # a censored flag with a hash that checks out is a contradiction
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
467 467
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    """Immutable record describing a candidate delta for a revision."""
    # distance from the base revision; bounds the amount of I/O needed to
    # restore the text (see `isgooddeltainfo`)
    distance = attr.ib()
    # size of the (compressed) delta itself
    deltalen = attr.ib()
    # the delta payload, as a (header, compressed-data) pair
    data = attr.ib()
    # revision this delta applies against
    base = attr.ib()
    # base revision of the delta chain `base` belongs to
    chainbase = attr.ib()
    # length of the delta chain, including this new delta
    chainlen = attr.ib()
    # cumulated size of the compressed deltas in the chain
    compresseddeltalen = attr.ib()
    # 0 for a full snapshot, the chain depth for an intermediate snapshot,
    # or None when the revision is stored as a plain delta
    snapshotdepth = attr.ib()
478 478
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Returns True if the given delta is good.

    Good means that it is within the disk span, disk size, and chain length
    bounds that we know to be performant.
    """
    if deltainfo is None:
        return False

    # - 'deltainfo.distance' is the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    if revlog._sparserevlog:
        # As sparse-read will be used, we can consider that the distance,
        # instead of being the span of the whole chunk,
        # is the span of the largest read chunk
        base = deltainfo.base

        if base != nullrev:
            deltachain = revlog._deltachain(base)[0]
        else:
            deltachain = []

        # search for the first non-snapshot revision
        #
        # `idx` must be pre-initialized: with an empty delta chain (base is
        # nullrev) the loop below never runs and `idx` would otherwise be
        # unbound when slicing `deltachain`.
        idx = 0
        for idx, r in enumerate(deltachain):
            if not revlog.issnapshot(r):
                break
        deltachain = deltachain[idx:]
        chunks = slicechunk(revlog, deltachain, deltainfo)
        all_span = [segmentspan(revlog, revs, deltainfo)
                    for revs in chunks]
        distance = max(all_span)
    else:
        distance = deltainfo.distance

    textlen = revinfo.textlen
    defaultmax = textlen * 4
    maxdist = revlog._maxdeltachainspan
    if not maxdist:
        maxdist = distance # ensure the conditional pass
    maxdist = max(maxdist, defaultmax)
    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
        # In multiple places, we are ignoring irrelevant data ranges below a
        # certain size. We also apply this tradeoff here and relax the span
        # constraint for small enough content.
        maxdist = revlog._srmingapsize

    # Bad delta from read span:
    #
    # If the span of data read is larger than the maximum allowed.
    if maxdist < distance:
        return False

    # Bad delta from new delta size:
    #
    # If the delta size is larger than the target text, storing the
    # delta will be inefficient.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    # If the sum of delta get larger than K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    # If the number of delta in the chain gets too high.
    if (revlog._maxchainlen
            and revlog._maxchainlen < deltainfo.chainlen):
        return False

    # bad delta from intermediate snapshot size limit
    #
    # If an intermediate snapshot size is higher than the limit. The
    # limit exist to prevent endless chain of intermediate delta to be
    # created.
    if (deltainfo.snapshotdepth is not None and
            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
        return False

    # bad delta if new intermediate snapshot is larger than the previous
    # snapshot
    if (deltainfo.snapshotdepth
            and revlog.length(deltainfo.base) < deltainfo.deltalen):
        return False

    return True
569 569
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides groups of revisions to be tested as delta bases

    This top level function focuses on emitting groups with unique and
    worthwhile content. See _raw_candidate_groups for details about the group
    order.

    The consumer drives the generator with ``send``: after trying a group it
    sends back the chosen base revision (or None), which is forwarded to
    `_refinedgroups` to refine the next suggestions.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    good = None

    # upper bound on the cumulated delta size worth considering
    # (see LIMIT_DELTA2TEXT)
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = set([nullrev])
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while not (rev == nullrev or rev in tested or deltalength(rev)):
                tested.add(rev)
                rev = deltaparent(rev)
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performance. Some bounding or slicing mechanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
620 620
621 621 def _findsnapshots(revlog, cache, start_rev):
622 622 """find snapshot from start_rev to tip"""
623 623 deltaparent = revlog.deltaparent
624 624 issnapshot = revlog.issnapshot
625 625 for rev in revlog.revs(start_rev):
626 626 if issnapshot(rev):
627 627 cache[deltaparent(rev)].append(rev)
628 628
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Generator of delta-base candidate groups, driven via ``send``.

    Yields tuples of candidate revisions; the consumer sends back the chosen
    base (or None) after trying each group, which steers further refinement.
    A final None is yielded once no candidates remain.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse of source deltas is useful when
    # we want to make sure we recompute "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    for candidates in _rawgroups(revlog, p1, p2, cachedelta):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk toward the chain base one step at a
        # time, letting the consumer challenge each ancestor
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: offer the children snapshots of the current
        # choice as potential better bases
        #
        # XXX the _findsnapshots call can be expensive and is "duplicated" with
        # the one done in `_rawgroups`. Once we start working on performance,
        # we should make the two logics share this computation.
        snapshots = collections.defaultdict(list)
        _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
674 679
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides groups of revisions to be tested as delta bases

    This lower level function focuses on emitting theoretically interesting
    deltas without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    # NOTE(review): `parents` is only bound inside the `gdelta` branch above,
    # so this relies on sparse-revlog implying general delta — confirm.
    if sparse and parents:
        snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revisions increases the odds we
        # reuse an independent, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshot that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level one are more
        # likely to result in small delta
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshots as the repository grows.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before has less chances
                # to be valid base since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshots as the repository grows.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
791 796
class deltacomputer(object):
    """Compute and select delta candidates for revisions appended to a revlog."""

    def __init__(self, revlog):
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh: file handle to either the .i or the .d revlog file,
            depending on whether it is inlined or not
        """
        btext = revinfo.btext
        # `btext` is a shared one-element cache: reuse a previous result
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        # store the rebuilt fulltext back into the shared cache
        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return the raw delta from revision ``base`` to this revision."""
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a _deltainfo for a delta of this revision against ``base``."""
        # can we use the cached delta?
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            # check if the cached diff still applies: walk up through empty
            # intermediate revisions until we reach `base` (or give up)
            currentbase = cachebase
            while (currentbase != nullrev
                   and currentbase != base
                   and self.revlog.length(currentbase) == 0):
                currentbase = self.revlog.deltaparent(currentbase)
            if currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        # the new revision will be appended right after the current tip
        offset = revlog.end(len(revlog) - 1)
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            deltabase = base
        else:
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        snapshotdepth = None
        if deltabase == nullrev:
            # a delta against nullrev is a full snapshot
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        """Build a _deltainfo storing this revision as a full snapshot."""
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh: file handle to either the .i or the .d revlog file,
            depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        # empty revisions are always stored as full snapshots
        if not revinfo.textlen:
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # keep the smallest acceptable delta seen so far
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                # report the chosen base back so the generator can refine
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now