##// END OF EJS Templates
revlog: add a way to blacklist some revisions when searching for a delta...
marmoute -
r48193:1efe3cde default
parent child Browse files
Show More
@@ -1,1106 +1,1115
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import nullrev
17 17 from ..i18n import _
18 18 from ..pycompat import getattr
19 19
20 20 from .constants import (
21 21 REVIDX_ISCENSORED,
22 22 REVIDX_RAWTEXT_CHANGING_FLAGS,
23 23 )
24 24
25 25 from ..thirdparty import attr
26 26
27 27 from .. import (
28 28 error,
29 29 mdiff,
30 30 util,
31 31 )
32 32
33 33 from . import flagutil
34 34
# maximum <delta-chain-data>/<revision-text-length> ratio: a delta chain whose
# cumulated delta size exceeds this multiple of the full text length is
# considered a bad storage trade-off (enforced by isgooddeltainfo and used to
# pre-filter candidate bases in _candidategroups)
LIMIT_DELTA2TEXT = 2
37 37
38 38
39 39 class _testrevlog(object):
40 40 """minimalist fake revlog to use in doctests"""
41 41
42 42 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
43 43 """data is an list of revision payload boundaries"""
44 44 self._data = data
45 45 self._srdensitythreshold = density
46 46 self._srmingapsize = mingap
47 47 self._snapshot = set(snapshot)
48 48 self.index = None
49 49
50 50 def start(self, rev):
51 51 if rev == nullrev:
52 52 return 0
53 53 if rev == 0:
54 54 return 0
55 55 return self._data[rev - 1]
56 56
57 57 def end(self, rev):
58 58 if rev == nullrev:
59 59 return 0
60 60 return self._data[rev]
61 61
62 62 def length(self, rev):
63 63 return self.end(rev) - self.start(rev)
64 64
65 65 def __len__(self):
66 66 return len(self._data)
67 67
68 68 def issnapshot(self, rev):
69 69 if rev == nullrev:
70 70 return True
71 71 return rev in self._snapshot
72 72
73 73
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> data = [
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ]
    >>> revlog = _testrevlog(data, snapshot=range(16))

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]

    Slicing involving nullrev
    >>> list(slicechunk(revlog, [-1, 0, 11, 13, 15], targetsize=20))
    [[-1, 0], [11], [13, 15]]
    >>> list(slicechunk(revlog, [-1, 13, 15], targetsize=5))
    [[-1], [13], [15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    density_slicer = getattr(revlog.index, 'slicechunktodensity', None)
    if density_slicer is None:
        # no native (C index) implementation available, use the python one

        def density_slicer(subrevs, density, mingap):
            return _slicechunktodensity(revlog, subrevs, density, mingap)

    dense_chunks = density_slicer(
        revs, revlog._srdensitythreshold, revlog._srmingapsize
    )
    for chunk in dense_chunks:
        # further split each density-selected chunk to honor targetsize
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
146 146
147 147
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunk that density slicing selected by that
    are still too large compared to the read garantee of revlog. This might
    happens when "minimal gap size" interrupted the slicing or when chain are
    built in a way that create large blocks next to each other.

    >>> data = [
    ...  3, #0 (3)
    ...  5, #1 (2)
    ...  6, #2 (1)
    ...  8, #3 (2)
    ...  8, #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ]

    == All snapshots cases ==
    >>> revlog = _testrevlog(data, snapshot=range(9))

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == No Snapshot cases ==
    >>> revlog = _testrevlog(data)

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1], [2, 3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [4, 5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == mixed case ==
    >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
    >>> list(_slicechunktosize(revlog, list(range(9)), 5))
    [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
    """
    assert targetsize is None or 0 <= targetsize
    startdata = revlog.start(revs[0])
    enddata = revlog.end(revs[-1])
    fullspan = enddata - startdata
    # fast path: the whole chunk already fits within the target size
    if targetsize is None or fullspan <= targetsize:
        yield revs
        return

    startrevidx = 0
    endrevidx = 1
    iterrevs = enumerate(revs)
    next(iterrevs)  # skip first rev.
    # first step: get snapshots out of the way
    for idx, r in iterrevs:
        span = revlog.end(r) - startdata
        snapshot = revlog.issnapshot(r)
        if span <= targetsize and snapshot:
            # still within budget and still a snapshot: extend current chunk
            endrevidx = idx + 1
        else:
            # flush the accumulated chunk and restart a new one at this rev
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx + 1
        if not snapshot:
            # first non-snapshot rev: hand over to the binary-slicing phase
            break

    # for the others, we use binary slicing to quickly converge toward valid
    # chunks (otherwise, we might end up looking for start/end of many
    # revisions). This logic is not looking for the perfect slicing point, it
    # focuses on quickly converging toward valid chunks.
    nbitem = len(revs)
    while (enddata - startdata) > targetsize:
        endrevidx = nbitem
        if nbitem - startrevidx <= 1:
            break  # protect against individual chunk larger than limit
        localenddata = revlog.end(revs[endrevidx - 1])
        span = localenddata - startdata
        while span > targetsize:
            if endrevidx - startrevidx <= 1:
                break  # protect against individual chunk larger than limit
            # halve the candidate range until the span fits the target
            endrevidx -= (endrevidx - startrevidx) // 2
            localenddata = revlog.end(revs[endrevidx - 1])
            span = localenddata - startdata
        chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
        if chunk:
            yield chunk
        startrevidx = endrevidx
        startdata = revlog.start(revs[startrevidx])

    # emit whatever remains after the loop converged
    chunk = _trimchunk(revlog, revs, startrevidx)
    if chunk:
        yield chunk
301 301
302 302
def _slicechunktodensity(revlog, revs, targetdensity=0.5, mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision is always dense enough
    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    if density >= targetdensity:
        yield revs
        return

    # Collect the gaps between revisions; they are sorted afterwards so we
    # can pop them from the largest to the smallest.
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps to pop them from largest to small
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # skipping this gap reduces the amount of data actually read;
        # recompute the density with the smaller read size
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    # emit the final chunk after the last cut point
    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
427 427
428 428
429 429 def _trimchunk(revlog, revs, startidx, endidx=None):
430 430 """returns revs[startidx:endidx] without empty trailing revs
431 431
432 432 Doctest Setup
433 433 >>> revlog = _testrevlog([
434 434 ... 5, #0
435 435 ... 10, #1
436 436 ... 12, #2
437 437 ... 12, #3 (empty)
438 438 ... 17, #4
439 439 ... 21, #5
440 440 ... 21, #6 (empty)
441 441 ... ])
442 442
443 443 Contiguous cases:
444 444 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
445 445 [0, 1, 2, 3, 4, 5]
446 446 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
447 447 [0, 1, 2, 3, 4]
448 448 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
449 449 [0, 1, 2]
450 450 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
451 451 [2]
452 452 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
453 453 [3, 4, 5]
454 454 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
455 455 [3, 4]
456 456
457 457 Discontiguous cases:
458 458 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
459 459 [1, 3, 5]
460 460 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
461 461 [1]
462 462 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
463 463 [3, 5]
464 464 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
465 465 [3, 5]
466 466 """
467 467 length = revlog.length
468 468
469 469 if endidx is None:
470 470 endidx = len(revs)
471 471
472 472 # If we have a non-emtpy delta candidate, there are nothing to trim
473 473 if revs[endidx - 1] < len(revlog):
474 474 # Trim empty revs at the end, except the very first revision of a chain
475 475 while (
476 476 endidx > 1 and endidx > startidx and length(revs[endidx - 1]) == 0
477 477 ):
478 478 endidx -= 1
479 479
480 480 return revs[startidx:endidx]
481 481
482 482
def segmentspan(revlog, revs):
    """Return the number of bytes spanned by a segment of revisions.

    ``revs`` is a sorted array of revision numbers; the span runs from the
    start offset of the first revision to the end offset of the last one.

    >>> revlog = _testrevlog([
    ...  5, #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    if not revs:
        return 0
    return revlog.end(revs[-1]) - revlog.start(revs[0])
511 511
512 512
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """Build the full text of a revision from a (base, delta) pair.

    The resulting text is hash-checked against ``expectednode`` (with parents
    ``p1``/``p2``) unless the flag processors say otherwise; censored content
    is only accepted when REVIDX_ISCENSORED is set in ``flags``.
    """
    # A delta replacing the entire base can be applied without decoding the
    # base revision at all; this also neatly sidesteps censored bases, which
    # raise when decoded.
    headerlen = struct.calcsize(b">lll")
    replaceheader = mdiff.replacediffheader(
        revlog.rawsize(baserev), len(delta) - headerlen
    )
    if delta[:headerlen] == replaceheader:
        fulltext = delta[headerlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        validatehash = flagutil.processflagsraw(revlog, fulltext, flags)
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            raise error.StorageError(
                _(b'node %s is not censored') % expectednode
            )
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
542 542
543 543
# Immutable record describing one computed delta candidate.
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    # distance from the base revision (bounds read I/O)
    distance = attr.ib()
    # size of the delta payload (including compression header, if any)
    deltalen = attr.ib()
    # the delta data itself, as a (header, body) pair of bytestrings
    data = attr.ib()
    # revision number used as delta base
    base = attr.ib()
    # first revision of the delta chain this delta would extend
    chainbase = attr.ib()
    # length of the resulting delta chain
    chainlen = attr.ib()
    # cumulated compressed size of the whole chain (bounds CPU to restore)
    compresseddeltalen = attr.ib()
    # snapshot depth if this delta is an intermediate snapshot, else None
    snapshotdepth = attr.ib()
554 554
555 555
def drop_u_compression(delta):
    """Turn a "u" (no-compression) delta into no-compression without a header.

    This is useful for revlog formats that have a better compression method.
    """
    header = delta.data[0]
    assert header == b'u', header
    # drop the one-byte b'u' marker: same payload, one byte shorter
    return _deltainfo(
        distance=delta.distance,
        deltalen=delta.deltalen - 1,
        data=(b'', delta.data[1]),
        base=delta.base,
        chainbase=delta.chainbase,
        chainlen=delta.chainlen,
        compresseddeltalen=delta.compresseddeltalen,
        snapshotdepth=delta.snapshotdepth,
    )
572 572
573 573
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Return True when ``deltainfo`` is an acceptable delta to store.

    "Good" means the delta stays within the disk-span, disk-size and
    chain-length bounds that we know keep read performance reasonable.
    """
    if deltainfo is None:
        return False

    # Two quantities bound the cost of using this delta:
    # - deltainfo.distance: distance from the base revision; bounding it
    #   limits the amount of I/O we need to do.
    # - deltainfo.compresseddeltalen: sum of the sizes of all deltas we need
    #   to apply; bounding it limits the amount of CPU we consume.
    textlen = revinfo.textlen

    # maximum span of data we accept reading from disk for this revision
    span_limit = revlog._maxdeltachainspan
    if not span_limit:
        span_limit = deltainfo.distance  # ensure the conditional pass
    span_limit = max(span_limit, textlen * 4)

    # Bad delta from read span:
    #
    # If the span of data read is larger than the maximum allowed.
    #
    # In the sparse-revlog case, we rely on the associated "sparse reading"
    # to avoid issues related to the span of data, so the check is skipped
    # entirely (pathological layouts do not happen in practice).
    if not revlog._sparserevlog and deltainfo.distance > span_limit:
        return False

    # Bad delta from new delta size: a delta larger than the target text is
    # inefficient storage.
    if deltainfo.deltalen > textlen:
        return False

    # Bad delta from cumulated payload size: the sum of the deltas grew past
    # K * target text length.
    if deltainfo.compresseddeltalen > textlen * LIMIT_DELTA2TEXT:
        return False

    # Bad delta from chain length: too many deltas in the chain.
    if revlog._maxchainlen and deltainfo.chainlen > revlog._maxchainlen:
        return False

    # Bad delta from intermediate snapshot size limit: the limit exists to
    # prevent endless chains of intermediate deltas from being created.
    snapdepth = deltainfo.snapshotdepth
    if snapdepth is not None and (textlen >> snapdepth) < deltainfo.deltalen:
        return False

    # Bad delta if the new intermediate snapshot is larger than the previous
    # snapshot it is based on.
    if snapdepth and revlog.length(deltainfo.base) < deltainfo.deltalen:
        return False

    return True
645 645
646 646
# If a revision's full text is that much bigger than a base candidate full
# text's, it is very unlikely that it will produce a valid delta. We no longer
# consider these candidates. (Used as a divisor in _candidategroups: bases
# smaller than textlen // LIMIT_BASE2TEXT are filtered out.)
LIMIT_BASE2TEXT = 500
651 651
652 652
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This top level function focus on emitting groups with unique and worthwhile
    content. See _raw_candidate_groups for details about the group order.

    This is a generator driven through ``send()``: the caller sends back the
    revision it found to be a good base (or None), and that feedback is
    forwarded to the underlying _refinedgroups generator.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    sparse = revlog._sparserevlog
    good = None

    # cumulated delta data beyond this size can never be a good trade-off
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = {nullrev}
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while revlog._generaldelta and not (
                rev == nullrev or rev in tested or deltalength(rev)
            ):
                tested.add(rev)
                rev = deltaparent(rev)
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            # If we reach here, we are about to build and test a delta.
            # The delta building process will compute the chaininfo in all
            # case, since that computation is cached, it is fine to access it
            # here too.
            chainlen, chainsize = revlog._chaininfo(rev)
            # if chain will be too long, skip base
            if revlog._maxchainlen and chainlen >= revlog._maxchainlen:
                continue
            # if chain already have too much data, skip base
            if deltas_limit < chainsize:
                continue
            if sparse and revlog.upperboundcomp is not None:
                maxcomp = revlog.upperboundcomp
                basenotsnap = (p1, p2, nullrev)
                if rev not in basenotsnap and revlog.issnapshot(rev):
                    snapshotdepth = revlog.snapshotdepth(rev)
                    # If text is significantly larger than the base, we can
                    # expect the resulting delta to be proportional to the size
                    # difference
                    revsize = revlog.rawsize(rev)
                    rawsizedistance = max(textlen - revsize, 0)
                    # use an estimate of the compression upper bound.
                    lowestrealisticdeltalen = rawsizedistance // maxcomp

                    # check the absolute constraint on the delta size
                    snapshotlimit = textlen >> snapshotdepth
                    if snapshotlimit < lowestrealisticdeltalen:
                        # delta lower bound is larger than accepted upper bound
                        continue

                    # check the relative constraint on the delta size
                    revlength = revlog.length(rev)
                    if revlength < lowestrealisticdeltalen:
                        # delta probable lower bound is larger than target base
                        continue

            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mecanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
744 744
745 745
def _findsnapshots(revlog, cache, start_rev):
    """Collect snapshots from start_rev to tip into ``cache``.

    ``cache`` maps a delta-parent revision to the list of snapshot revisions
    based on it. A native index implementation is used when available.
    """
    index = revlog.index
    if util.safehasattr(index, b'findsnapshots'):
        # fast path: the C index can walk itself
        index.findsnapshots(cache, start_rev)
        return
    issnapshot = revlog.issnapshot
    deltaparent = revlog.deltaparent
    for rev in revlog.revs(start_rev):
        if issnapshot(rev):
            cache[deltaparent(rev)].append(rev)
756 756
757 757
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Emit groups of candidate delta bases, refined by caller feedback.

    Generator driven through ``send()``: the caller sends back the revision
    it found to be a good base (or None). A trailing ``yield None`` marks
    exhaustion.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    snapshots = collections.defaultdict(list)
    for candidates in _rawgroups(revlog, p1, p2, cachedelta, snapshots):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk toward the root of the snapshot chain,
        # offering each ancestor base in turn
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: offer the snapshot children of the chosen base
        if not snapshots:
            _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
805 805
806 806
def _rawgroups(revlog, p1, p2, cachedelta, snapshots=None):
    """Provides group of revision to be tested as delta base

    This lower level function focuses on emitting theoretically interesting
    deltas without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    # gate sparse behind general-delta because of issue6056
    sparse = gdelta and revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    # NOTE: `sparse` implies `gdelta`, so short-circuiting guarantees
    # `parents` is defined whenever this branch is entered.
    if sparse and parents:
        if snapshots is None:
            # map: base-rev: snapshot-rev
            snapshots = collections.defaultdict(list)
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revisions increases the odds we
        # reuse an independent, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshots that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level ones are more
        # likely to result in small deltas
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshots as the repository grows.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before have less chance
                # to be valid bases since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshots as the repository grows.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
926 926
927 927
class deltacomputer(object):
    """Helper computing deltas (and full snapshots) to store in a revlog.

    Wraps a revlog instance and exposes `finddeltainfo`, which searches the
    candidate groups produced by `_candidategroups` for an acceptable delta
    base, falling back to a full snapshot when none qualifies.
    """

    def __init__(self, revlog):
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: revisioninfo instance that contains all needed info
        fh: file handle to either the .i or the .d revlog file,
        depending on whether it is inlined or not

        NOTE: when the fulltext is not already cached in ``revinfo.btext``,
        this unconditionally dereferences ``revinfo.cachedelta`` — callers
        must provide a cached delta in that case.
        """
        btext = revinfo.btext
        # btext is a one-element mutable cache; reuse the fulltext if a
        # previous call already computed it.
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        # _textfromdelta also validates the resulting text against
        # revinfo.node; the result is memoized in btext for later calls.
        fulltext = btext[0] = _textfromdelta(
            fh,
            revlog,
            baserev,
            delta,
            revinfo.p1,
            revinfo.p2,
            revinfo.flags,
            revinfo.node,
        )
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return the raw (uncompressed) delta of ``revinfo`` against ``base``.

        base: base revision number to diff against
        revinfo: revisioninfo instance for the revision being stored
        fh: file handle to the revlog data (.i or .d, depending on inlining)
        """
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.rawdata(base, _df=fh)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a ``_deltainfo`` describing a delta of ``revinfo`` over ``base``.

        Returns None when the delta is pre-emptively rejected by the
        compression-upper-bound heuristic (see ``upperboundcomp`` below).
        """
        # can we use the cached delta?
        revlog = self.revlog
        chainbase = revlog.chainbase(base)
        if revlog._generaldelta:
            deltabase = base
        else:
            # without general delta, the base is always the previous entry
            # in the chain, i.e. the chain base.
            deltabase = chainbase
        snapshotdepth = None
        if revlog._sparserevlog and deltabase == nullrev:
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                # depth of the new snapshot: one more than its base's chain
                snapshotdepth = len(revlog._deltachain(deltabase)[0])
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            # check if the cached diff still applies: walk up from the cached
            # base through empty (zero-length) revisions, which store no data
            # and therefore do not invalidate the diff.
            currentbase = cachebase
            while (
                currentbase != nullrev
                and currentbase != base
                and self.revlog.length(currentbase) == 0
            ):
                currentbase = self.revlog.deltaparent(currentbase)
            if self.revlog._lazydelta and currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        # snapshotdepth need to be neither None nor 0 (level-0 snapshot)
        if revlog.upperboundcomp is not None and snapshotdepth:
            # best-case compressed size given the configured upper bound on
            # the compression ratio; used to reject hopeless snapshots early,
            # before paying for actual compression.
            lowestrealisticdeltalen = len(delta) // revlog.upperboundcomp
            snapshotlimit = revinfo.textlen >> snapshotdepth
            if snapshotlimit < lowestrealisticdeltalen:
                return None
            if revlog.length(base) < lowestrealisticdeltalen:
                return None
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        offset = revlog.end(len(revlog) - 1)
        # distance from the start of the chain base to the end of this delta
        dist = deltalen + offset - revlog.start(chainbase)
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        return _deltainfo(
            dist,
            deltalen,
            (header, data),
            deltabase,
            chainbase,
            chainlen,
            compresseddeltalen,
            snapshotdepth,
        )

    def _fullsnapshotinfo(self, fh, revinfo):
        """Build a ``_deltainfo`` storing ``revinfo`` as a full snapshot.

        The entry is its own base (deltabase == chainbase == its own rev),
        with snapshot depth 0 and a chain of length 1.
        """
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(
            dist,
            deltalen,
            data,
            deltabase,
            chainbase,
            chainlen,
            compresseddeltalen,
            snapshotdepth,
        )

    def finddeltainfo(self, revinfo, fh, excluded_bases=None):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh: file handle to either the .i or the .d revlog file,
        depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.

        `excluded_bases` is an optional set of revisions that cannot be used
        as a delta base. Use this to recompute deltas suitable in censor or
        strip context.
        """
        # empty revisions are always stored as full snapshots
        if not revinfo.textlen:
            return self._fullsnapshotinfo(fh, revinfo)

        if excluded_bases is None:
            excluded_bases = set()

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(
            self.revlog, revinfo.textlen, p1r, p2r, cachedelta
        )
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                # skip bases blacklisted by the caller (censor/strip)
                if candidaterev in excluded_bases:
                    continue
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if candidatedelta is not None:
                    if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                        nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                # feed the chosen base back so the generator can refine its
                # next batch of candidates around it (generator.send protocol)
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            # nothing acceptable found: fall back to a full snapshot
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now