##// END OF EJS Templates
sparse-revlog: stop using a heap to track gaps...
Boris Feld -
r40643:54de2340 default
parent child Browse files
Show More
@@ -1,896 +1,897 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import heapq
14 14 import struct
15 15
16 16 # import stuff from node for others to import from revlog
17 17 from ..node import (
18 18 nullrev,
19 19 )
20 20 from ..i18n import _
21 21
22 22 from .constants import (
23 23 REVIDX_ISCENSORED,
24 24 REVIDX_RAWTEXT_CHANGING_FLAGS,
25 25 )
26 26
27 27 from ..thirdparty import (
28 28 attr,
29 29 )
30 30
31 31 from .. import (
32 32 error,
33 33 mdiff,
34 34 )
35 35
# maximum <delta-chain-data>/<revision-text-length> ratio: a delta chain whose
# cumulated compressed size exceeds this multiple of the revision's text
# length is rejected (see isgooddeltainfo and _candidategroups)
LIMIT_DELTA2TEXT = 2
38 38
39 39 class _testrevlog(object):
40 40 """minimalist fake revlog to use in doctests"""
41 41
42 42 def __init__(self, data, density=0.5, mingap=0):
43 43 """data is an list of revision payload boundaries"""
44 44 self._data = data
45 45 self._srdensitythreshold = density
46 46 self._srmingapsize = mingap
47 47
48 48 def start(self, rev):
49 49 if rev == 0:
50 50 return 0
51 51 return self._data[rev - 1]
52 52
53 53 def end(self, rev):
54 54 return self._data[rev]
55 55
56 56 def length(self, rev):
57 57 return self.end(rev) - self.start(rev)
58 58
59 59 def __len__(self):
60 60 return len(self._data)
61 61
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [13, 15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    density = revlog._srdensitythreshold
    mingap = revlog._srmingapsize
    # first slice by density, then cap each resulting chunk by target size
    for chunk in _slicechunktodensity(revlog, revs, density, mingap):
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
124 124
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunk that density slicing selected by that
    are still too large compared to the read guarantee of revlog. This might
    happen when "minimal gap size" interrupted the slicing or when chain are
    built in a way that create large blocks next to each other.

    >>> revlog = _testrevlog([
    ...  3,  #0 (3)
    ...  5,  #1 (2)
    ...  6,  #2 (1)
    ...  8,  #3 (2)
    ...  8,  #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ])

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]
    """
    assert targetsize is None or 0 <= targetsize
    # nothing to do when the whole span already fits the budget
    if targetsize is None or segmentspan(revlog, revs) <= targetsize:
        yield revs
        return

    chunkstart = 0                       # index in `revs` opening current chunk
    chunkend = 0                         # last index known to fit the budget
    basedata = revlog.start(revs[0])     # on-disk offset the span is measured from
    for idx in range(1, len(revs)):
        rev = revs[idx]
        if revlog.end(rev) - basedata <= targetsize:
            chunkend = idx
            continue
        # this rev overflows the budget: emit what we have so far and
        # open a fresh chunk starting at this rev
        emitted = _trimchunk(revlog, revs, chunkstart, chunkend + 1)
        if emitted:
            yield emitted
        chunkstart = chunkend = idx
        basedata = revlog.start(rev)
    yield _trimchunk(revlog, revs, chunkstart)
201 201
def _slicechunktodensity(revlog, revs, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    if density >= targetdensity:
        yield revs
        return

    # Gather every gap larger than `mingapsize` as a (size, index) pair
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps to pop them from largest to small
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # skipping this gap shrinks the amount of data we would actually
        # read, improving the density of what remains
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    # cut points must be applied in index order
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:
        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
327 328
328 329 def _trimchunk(revlog, revs, startidx, endidx=None):
329 330 """returns revs[startidx:endidx] without empty trailing revs
330 331
331 332 Doctest Setup
332 333 >>> revlog = _testrevlog([
333 334 ... 5, #0
334 335 ... 10, #1
335 336 ... 12, #2
336 337 ... 12, #3 (empty)
337 338 ... 17, #4
338 339 ... 21, #5
339 340 ... 21, #6 (empty)
340 341 ... ])
341 342
342 343 Contiguous cases:
343 344 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
344 345 [0, 1, 2, 3, 4, 5]
345 346 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
346 347 [0, 1, 2, 3, 4]
347 348 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
348 349 [0, 1, 2]
349 350 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
350 351 [2]
351 352 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
352 353 [3, 4, 5]
353 354 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
354 355 [3, 4]
355 356
356 357 Discontiguous cases:
357 358 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
358 359 [1, 3, 5]
359 360 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
360 361 [1]
361 362 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
362 363 [3, 5]
363 364 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
364 365 [3, 5]
365 366 """
366 367 length = revlog.length
367 368
368 369 if endidx is None:
369 370 endidx = len(revs)
370 371
371 372 # If we have a non-emtpy delta candidate, there are nothing to trim
372 373 if revs[endidx - 1] < len(revlog):
373 374 # Trim empty revs at the end, except the very first revision of a chain
374 375 while (endidx > 1
375 376 and endidx > startidx
376 377 and length(revs[endidx - 1]) == 0):
377 378 endidx -= 1
378 379
379 380 return revs[startidx:endidx]
380 381
def segmentspan(revlog, revs):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5,  #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    # an empty segment covers no bytes at all
    if not revs:
        return 0
    # distance from the first byte of the first rev to the last byte of
    # the last rev, gaps included
    return revlog.end(revs[-1]) - revlog.start(revs[0])
409 410
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata

    fh:           file handle to read the base revision from
    revlog:       revlog the base revision lives in
    baserev:      revision number the delta applies against
    delta:        binary delta payload
    p1, p2:       parent nodes, used for hash validation
    flags:        revision index flags (e.g. REVIDX_ISCENSORED)
    expectednode: node id the resulting text must hash to

    Raises error.StorageError if a revision flagged as censored turns out
    not to be, and re-raises CensoredNodeError when the censored flag is
    missing.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            # reaching this point means the censored flag was set but the
            # text decoded without raising — the flag is therefore wrong
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
438 439
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    """Immutable description of one candidate delta for a revision."""
    # byte span from the start of the chain base to the end of this delta
    # (see _builddeltainfo) — bounds the I/O needed to restore the text
    distance = attr.ib()
    # length of the compressed delta
    deltalen = attr.ib()
    # compressed delta payload — presumably a (header, data) pair as
    # produced by revlog.compress; verify against callers
    data = attr.ib()
    # revision this delta applies against
    base = attr.ib()
    # first revision of the delta chain
    chainbase = attr.ib()
    # number of deltas in the chain, this one included
    chainlen = attr.ib()
    # cumulated compressed size of all deltas in the chain
    compresseddeltalen = attr.ib()
    # snapshot depth when this delta would form a snapshot, None otherwise
    snapshotdepth = attr.ib()
449 450
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Returns True if the given delta is good. Good means that it is within
    the disk span, disk size, and chain length bounds that we know to be
    performant."""
    if deltainfo is None:
        return False

    # - 'deltainfo.distance' is the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    textlen = revinfo.textlen
    # fall back to the delta's own distance when no span limit is set,
    # and never allow the cap below four times the text length
    maxdist = revlog._maxdeltachainspan or deltainfo.distance
    maxdist = max(maxdist, textlen * 4)

    # Bad delta from read span:
    #
    # If the span of data read is larger than the maximum allowed.
    #
    # In the sparse-revlog case, we rely on the associated "sparse reading"
    # to avoid issue related to the span of data. In theory, it would be
    # possible to build pathological revlog where delta pattern would lead
    # to too many reads. However, they do not happen in practice at all. So
    # we skip the span check entirely.
    if not revlog._sparserevlog and maxdist < deltainfo.distance:
        return False

    # Bad delta from new delta size:
    #
    # If the delta size is larger than the target text, storing the
    # delta will be inefficient.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    # If the sum of delta get larger than K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    # If the number of delta in the chain gets too high.
    chainlimit = revlog._maxchainlen
    if chainlimit and chainlimit < deltainfo.chainlen:
        return False

    # bad delta from intermediate snapshot size limit
    #
    # If an intermediate snapshot size is higher than the limit. The
    # limit exist to prevent endless chain of intermediate delta to be
    # created.
    depth = deltainfo.snapshotdepth
    if depth is not None and (textlen >> depth) < deltainfo.deltalen:
        return False

    # bad delta if new intermediate snapshot is larger than the previous
    # snapshot
    if depth and revlog.length(deltainfo.base) < deltainfo.deltalen:
        return False

    return True
518 519
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This top level function focus on emitting groups with unique and worthwhile
    content. See _raw_candidate_groups for details about the group order.

    This is a generator using the ``send`` protocol: the caller sends back
    the base it picked (or None) after each yielded group, and the final
    value yielded is always None.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    good = None

    # upper bound on the total delta-chain size we are willing to accept
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = set([nullrev])
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while (revlog._generaldelta
                   and not (rev == nullrev
                            or rev in tested
                            or deltalength(rev))):
                tested.add(rev)
                rev = deltaparent(rev)
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mechanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
572 573
573 574 def _findsnapshots(revlog, cache, start_rev):
574 575 """find snapshot from start_rev to tip"""
575 576 deltaparent = revlog.deltaparent
576 577 issnapshot = revlog.issnapshot
577 578 for rev in revlog.revs(start_rev):
578 579 if issnapshot(rev):
579 580 cache[deltaparent(rev)].append(rev)
580 581
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Emit candidate-base groups, refining around any accepted base.

    Generator using the ``send`` protocol: after each yielded group the
    caller sends back the base it accepted (or None). Always terminates by
    yielding None.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    for candidates in _rawgroups(revlog, p1, p2, cachedelta):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk the delta-parent chain toward nullrev,
        # offering each ancestor as a potentially better base
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: offer the snapshot's children as bases
        #
        # XXX the _findsnapshots call can be expensive and is "duplicated" with
        # the one done in `_rawgroups`. Once we start working on performance,
        # we should make the two logics share this computation.
        snapshots = collections.defaultdict(list)
        _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
631 632
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This lower level function focus on emitting delta theoretically
    interesting without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    # convenience accessor: list of revisions in a revision's delta chain
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    # NOTE(review): `parents` is only bound inside the `if gdelta:` branch
    # above — this relies on sparse implying general delta; confirm.
    if sparse and parents:
        snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revision increase the odd we
        # reuse an independent, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshot that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level one are more
        # likely to result in small delta
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshot as the repository grow.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before has less chances
                # to be valid base since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshot as the repository grow.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
748 749
class deltacomputer(object):
    """Compute and select deltas for revisions added to a revlog."""

    def __init__(self, revlog):
        # revlog this computer produces deltas for
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        The result is memoized in revinfo.btext[0].
        """
        btext = revinfo.btext
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return the binary diff of the revision in revinfo against base."""
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a _deltainfo describing a delta of revinfo against base."""
        # can we use the cached delta?
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            # check if the diff still apply
            currentbase = cachebase
            while (currentbase != nullrev
                   and currentbase != base
                   and self.revlog.length(currentbase) == 0):
                currentbase = self.revlog.deltaparent(currentbase)
            if currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        offset = revlog.end(len(revlog) - 1)
        # span from the start of the chain base to the end of this delta
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            deltabase = base
        else:
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        snapshotdepth = None
        if deltabase == nullrev:
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        """Build a _deltainfo storing the revision as a full snapshot."""
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        # a full snapshot: delta length, distance and cumulated size all
        # equal the compressed fulltext size
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        if not revinfo.textlen:
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # keep the smallest delta found so far
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                # feed the chosen base back so the generator can refine
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now