delta-find: move `_refinedgroups` on the `_DeltaSearch` object...
marmoute
r52214:c8307440 default
@@ -1,1670 +1,1673 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10
11 11 import collections
12 12 import struct
13 13
14 14 # import stuff from node for others to import from revlog
15 15 from ..node import nullrev
16 16 from ..i18n import _
17 17
18 18 from .constants import (
19 19 COMP_MODE_DEFAULT,
20 20 COMP_MODE_INLINE,
21 21 COMP_MODE_PLAIN,
22 22 DELTA_BASE_REUSE_FORCE,
23 23 DELTA_BASE_REUSE_NO,
24 24 KIND_CHANGELOG,
25 25 KIND_FILELOG,
26 26 KIND_MANIFESTLOG,
27 27 REVIDX_ISCENSORED,
28 28 REVIDX_RAWTEXT_CHANGING_FLAGS,
29 29 )
30 30
31 31 from ..thirdparty import attr
32 32
33 33 from .. import (
34 34 error,
35 35 mdiff,
36 36 util,
37 37 )
38 38
39 39 from . import flagutil
40 40
41 41 # maximum <delta-chain-data>/<revision-text-length> ratio
42 42 LIMIT_DELTA2TEXT = 2
43 43
44 44
45 45 class _testrevlog:
46 46 """minimalist fake revlog to use in doctests"""
47 47
48 48 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
49 49 """data is an list of revision payload boundaries"""
50 50 from .. import revlog
51 51
52 52 self._data = data
53 53 self.data_config = revlog.DataConfig()
54 54 self.data_config.sr_density_threshold = density
55 55 self.data_config.sr_min_gap_size = mingap
56 56 self.delta_config = revlog.DeltaConfig()
57 57 self.feature_config = revlog.FeatureConfig()
58 58 self._snapshot = set(snapshot)
59 59 self.index = None
60 60
61 61 def start(self, rev):
62 62 if rev == nullrev:
63 63 return 0
64 64 if rev == 0:
65 65 return 0
66 66 return self._data[rev - 1]
67 67
68 68 def end(self, rev):
69 69 if rev == nullrev:
70 70 return 0
71 71 return self._data[rev]
72 72
73 73 def length(self, rev):
74 74 return self.end(rev) - self.start(rev)
75 75
76 76 def __len__(self):
77 77 return len(self._data)
78 78
79 79 def issnapshot(self, rev):
80 80 if rev == nullrev:
81 81 return True
82 82 return rev in self._snapshot
83 83
84 84
85 85 def slicechunk(revlog, revs, targetsize=None):
86 86 """slice revs to reduce the amount of unrelated data to be read from disk.
87 87
88 88 ``revs`` is sliced into groups that should be read in one go.
89 89 Assume that revs are sorted.
90 90
91 91 The initial chunk is sliced until the overall density (payload/chunks-span
92 92 ratio) is above `revlog.data_config.sr_density_threshold`. No gap smaller
93 93 than `revlog.data_config.sr_min_gap_size` is skipped.
94 94
95 95 If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
96 96 For consistency with the other slicing choices, this limit won't go lower
97 97 than `revlog.data_config.sr_min_gap_size`.
98 98
99 99 If an individual revision chunk is larger than this limit, it will still
100 100 be yielded individually.
101 101
102 102 >>> data = [
103 103 ... 5, #00 (5)
104 104 ... 10, #01 (5)
105 105 ... 12, #02 (2)
106 106 ... 12, #03 (empty)
107 107 ... 27, #04 (15)
108 108 ... 31, #05 (4)
109 109 ... 31, #06 (empty)
110 110 ... 42, #07 (11)
111 111 ... 47, #08 (5)
112 112 ... 47, #09 (empty)
113 113 ... 48, #10 (1)
114 114 ... 51, #11 (3)
115 115 ... 74, #12 (23)
116 116 ... 85, #13 (11)
117 117 ... 86, #14 (1)
118 118 ... 91, #15 (5)
119 119 ... ]
120 120 >>> revlog = _testrevlog(data, snapshot=range(16))
121 121
122 122 >>> list(slicechunk(revlog, list(range(16))))
123 123 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
124 124 >>> list(slicechunk(revlog, [0, 15]))
125 125 [[0], [15]]
126 126 >>> list(slicechunk(revlog, [0, 11, 15]))
127 127 [[0], [11], [15]]
128 128 >>> list(slicechunk(revlog, [0, 11, 13, 15]))
129 129 [[0], [11, 13, 15]]
130 130 >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
131 131 [[1, 2], [5, 8, 10, 11], [14]]
132 132
133 133 Slicing with a maximum chunk size
134 134 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
135 135 [[0], [11], [13], [15]]
136 136 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
137 137 [[0], [11], [13, 15]]
138 138
139 139 Slicing involving nullrev
140 140 >>> list(slicechunk(revlog, [-1, 0, 11, 13, 15], targetsize=20))
141 141 [[-1, 0], [11], [13, 15]]
142 142 >>> list(slicechunk(revlog, [-1, 13, 15], targetsize=5))
143 143 [[-1], [13], [15]]
144 144 """
145 145 if targetsize is not None:
146 146 targetsize = max(targetsize, revlog.data_config.sr_min_gap_size)
147 147 # targetsize should not be specified when evaluating delta candidates:
148 148 # * targetsize is used to ensure we stay within specification when reading,
149 149 densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
150 150 if densityslicing is None:
151 151 densityslicing = lambda x, y, z: _slicechunktodensity(revlog, x, y, z)
152 152 for chunk in densityslicing(
153 153 revs,
154 154 revlog.data_config.sr_density_threshold,
155 155 revlog.data_config.sr_min_gap_size,
156 156 ):
157 157 for subchunk in _slicechunktosize(revlog, chunk, targetsize):
158 158 yield subchunk
159 159
160 160
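Ignoring the native-index fast path, `slicechunk` is essentially the composition of the two helpers that follow, density slicing first, then size capping (threshold arguments abbreviated here):

    for dense_chunk in _slicechunktodensity(revlog, revs, density, mingap):
        for chunk in _slicechunktosize(revlog, dense_chunk, targetsize):
            yield chunk  # what slicechunk ultimately produces
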
161 161 def _slicechunktosize(revlog, revs, targetsize=None):
162 162 """slice revs to match the target size
163 163
164 164 This is intended to be used on chunks that density slicing selected but that
165 165 are still too large compared to the read guarantee of the revlog. This might
166 166 happen when the "minimal gap size" interrupted the slicing or when chains are
167 167 built in a way that creates large blocks next to each other.
168 168
169 169 >>> data = [
170 170 ... 3, #0 (3)
171 171 ... 5, #1 (2)
172 172 ... 6, #2 (1)
173 173 ... 8, #3 (2)
174 174 ... 8, #4 (empty)
175 175 ... 11, #5 (3)
176 176 ... 12, #6 (1)
177 177 ... 13, #7 (1)
178 178 ... 14, #8 (1)
179 179 ... ]
180 180
181 181 == All snapshots cases ==
182 182 >>> revlog = _testrevlog(data, snapshot=range(9))
183 183
184 184 Cases where chunk is already small enough
185 185 >>> list(_slicechunktosize(revlog, [0], 3))
186 186 [[0]]
187 187 >>> list(_slicechunktosize(revlog, [6, 7], 3))
188 188 [[6, 7]]
189 189 >>> list(_slicechunktosize(revlog, [0], None))
190 190 [[0]]
191 191 >>> list(_slicechunktosize(revlog, [6, 7], None))
192 192 [[6, 7]]
193 193
194 194 cases where we need actual slicing
195 195 >>> list(_slicechunktosize(revlog, [0, 1], 3))
196 196 [[0], [1]]
197 197 >>> list(_slicechunktosize(revlog, [1, 3], 3))
198 198 [[1], [3]]
199 199 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
200 200 [[1, 2], [3]]
201 201 >>> list(_slicechunktosize(revlog, [3, 5], 3))
202 202 [[3], [5]]
203 203 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
204 204 [[3], [5]]
205 205 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
206 206 [[5], [6, 7, 8]]
207 207 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
208 208 [[0], [1, 2], [3], [5], [6, 7, 8]]
209 209
210 210 Case with too large individual chunk (must return valid chunk)
211 211 >>> list(_slicechunktosize(revlog, [0, 1], 2))
212 212 [[0], [1]]
213 213 >>> list(_slicechunktosize(revlog, [1, 3], 1))
214 214 [[1], [3]]
215 215 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
216 216 [[3], [5]]
217 217
218 218 == No Snapshot cases ==
219 219 >>> revlog = _testrevlog(data)
220 220
221 221 Cases where chunk is already small enough
222 222 >>> list(_slicechunktosize(revlog, [0], 3))
223 223 [[0]]
224 224 >>> list(_slicechunktosize(revlog, [6, 7], 3))
225 225 [[6, 7]]
226 226 >>> list(_slicechunktosize(revlog, [0], None))
227 227 [[0]]
228 228 >>> list(_slicechunktosize(revlog, [6, 7], None))
229 229 [[6, 7]]
230 230
231 231 cases where we need actual slicing
232 232 >>> list(_slicechunktosize(revlog, [0, 1], 3))
233 233 [[0], [1]]
234 234 >>> list(_slicechunktosize(revlog, [1, 3], 3))
235 235 [[1], [3]]
236 236 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
237 237 [[1], [2, 3]]
238 238 >>> list(_slicechunktosize(revlog, [3, 5], 3))
239 239 [[3], [5]]
240 240 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
241 241 [[3], [4, 5]]
242 242 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
243 243 [[5], [6, 7, 8]]
244 244 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
245 245 [[0], [1, 2], [3], [5], [6, 7, 8]]
246 246
247 247 Case with too large individual chunk (must return valid chunk)
248 248 >>> list(_slicechunktosize(revlog, [0, 1], 2))
249 249 [[0], [1]]
250 250 >>> list(_slicechunktosize(revlog, [1, 3], 1))
251 251 [[1], [3]]
252 252 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
253 253 [[3], [5]]
254 254
255 255 == mixed case ==
256 256 >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
257 257 >>> list(_slicechunktosize(revlog, list(range(9)), 5))
258 258 [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
259 259 """
260 260 assert targetsize is None or 0 <= targetsize
261 261 startdata = revlog.start(revs[0])
262 262 enddata = revlog.end(revs[-1])
263 263 fullspan = enddata - startdata
264 264 if targetsize is None or fullspan <= targetsize:
265 265 yield revs
266 266 return
267 267
268 268 startrevidx = 0
269 269 endrevidx = 1
270 270 iterrevs = enumerate(revs)
271 271 next(iterrevs) # skip first rev.
272 272 # first step: get snapshots out of the way
273 273 for idx, r in iterrevs:
274 274 span = revlog.end(r) - startdata
275 275 snapshot = revlog.issnapshot(r)
276 276 if span <= targetsize and snapshot:
277 277 endrevidx = idx + 1
278 278 else:
279 279 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
280 280 if chunk:
281 281 yield chunk
282 282 startrevidx = idx
283 283 startdata = revlog.start(r)
284 284 endrevidx = idx + 1
285 285 if not snapshot:
286 286 break
287 287
288 288 # for the others, we use binary slicing to quickly converge toward valid
289 289 # chunks (otherwise, we might end up looking for start/end of many
290 290 # revisions). This logic is not looking for the perfect slicing point, it
291 291 # focuses on quickly converging toward valid chunks.
292 292 nbitem = len(revs)
293 293 while (enddata - startdata) > targetsize:
294 294 endrevidx = nbitem
295 295 if nbitem - startrevidx <= 1:
296 296 break # protect against individual chunk larger than limit
297 297 localenddata = revlog.end(revs[endrevidx - 1])
298 298 span = localenddata - startdata
299 299 while span > targetsize:
300 300 if endrevidx - startrevidx <= 1:
301 301 break # protect against individual chunk larger than limit
302 302 endrevidx -= (endrevidx - startrevidx) // 2
303 303 localenddata = revlog.end(revs[endrevidx - 1])
304 304 span = localenddata - startdata
305 305 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
306 306 if chunk:
307 307 yield chunk
308 308 startrevidx = endrevidx
309 309 startdata = revlog.start(revs[startrevidx])
310 310
311 311 chunk = _trimchunk(revlog, revs, startrevidx)
312 312 if chunk:
313 313 yield chunk
314 314
315 315
316 316 def _slicechunktodensity(revlog, revs, targetdensity=0.5, mingapsize=0):
317 317 """slice revs to reduce the amount of unrelated data to be read from disk.
318 318
319 319 ``revs`` is sliced into groups that should be read in one go.
320 320 Assume that revs are sorted.
321 321
322 322 The initial chunk is sliced until the overall density (payload/chunks-span
323 323 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
324 324 skipped.
325 325
326 326 >>> revlog = _testrevlog([
327 327 ... 5, #00 (5)
328 328 ... 10, #01 (5)
329 329 ... 12, #02 (2)
330 330 ... 12, #03 (empty)
331 331 ... 27, #04 (15)
332 332 ... 31, #05 (4)
333 333 ... 31, #06 (empty)
334 334 ... 42, #07 (11)
335 335 ... 47, #08 (5)
336 336 ... 47, #09 (empty)
337 337 ... 48, #10 (1)
338 338 ... 51, #11 (3)
339 339 ... 74, #12 (23)
340 340 ... 85, #13 (11)
341 341 ... 86, #14 (1)
342 342 ... 91, #15 (5)
343 343 ... ])
344 344
345 345 >>> list(_slicechunktodensity(revlog, list(range(16))))
346 346 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
347 347 >>> list(_slicechunktodensity(revlog, [0, 15]))
348 348 [[0], [15]]
349 349 >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
350 350 [[0], [11], [15]]
351 351 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
352 352 [[0], [11, 13, 15]]
353 353 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
354 354 [[1, 2], [5, 8, 10, 11], [14]]
355 355 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
356 356 ... mingapsize=20))
357 357 [[1, 2, 3, 5, 8, 10, 11], [14]]
358 358 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
359 359 ... targetdensity=0.95))
360 360 [[1, 2], [5], [8, 10, 11], [14]]
361 361 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
362 362 ... targetdensity=0.95, mingapsize=12))
363 363 [[1, 2], [5, 8, 10, 11], [14]]
364 364 """
365 365 start = revlog.start
366 366 length = revlog.length
367 367
368 368 if len(revs) <= 1:
369 369 yield revs
370 370 return
371 371
372 372 deltachainspan = segmentspan(revlog, revs)
373 373
374 374 if deltachainspan < mingapsize:
375 375 yield revs
376 376 return
377 377
378 378 readdata = deltachainspan
379 379 chainpayload = sum(length(r) for r in revs)
380 380
381 381 if deltachainspan:
382 382 density = chainpayload / float(deltachainspan)
383 383 else:
384 384 density = 1.0
385 385
386 386 if density >= targetdensity:
387 387 yield revs
388 388 return
389 389
390 390 # Store the gaps in a list, to be sorted by size below
391 391 gaps = []
392 392 prevend = None
393 393 for i, rev in enumerate(revs):
394 394 revstart = start(rev)
395 395 revlen = length(rev)
396 396
397 397 # Skip empty revisions to form larger holes
398 398 if revlen == 0:
399 399 continue
400 400
401 401 if prevend is not None:
402 402 gapsize = revstart - prevend
403 403 # only consider holes that are large enough
404 404 if gapsize > mingapsize:
405 405 gaps.append((gapsize, i))
406 406
407 407 prevend = revstart + revlen
408 408 # sort the gaps to pop them from largest to smallest
409 409 gaps.sort()
410 410
411 411 # Collect the indices of the largest holes until the density is acceptable
412 412 selected = []
413 413 while gaps and density < targetdensity:
414 414 gapsize, gapidx = gaps.pop()
415 415
416 416 selected.append(gapidx)
417 417
418 418 # the gaps were sorted in ascending order, so the pop above removed
419 419 # the largest remaining gap
420 420 readdata -= gapsize
421 421 if readdata > 0:
422 422 density = chainpayload / float(readdata)
423 423 else:
424 424 density = 1.0
425 425 selected.sort()
426 426
427 427 # Cut the revs at collected indices
428 428 previdx = 0
429 429 for idx in selected:
430 430
431 431 chunk = _trimchunk(revlog, revs, previdx, idx)
432 432 if chunk:
433 433 yield chunk
434 434
435 435 previdx = idx
436 436
437 437 chunk = _trimchunk(revlog, revs, previdx)
438 438 if chunk:
439 439 yield chunk
440 440
441 441
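A worked instance of the density test, taken from the doctest data above with revs [0, 15]:

    payload = 5 + 5           # length(0) + length(15)
    span = 91 - 0             # end(15) - start(0)
    density = payload / span  # ~0.11, below the 0.5 default threshold,
                              # so the big gap is cut, yielding [[0], [15]]
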
442 442 def _trimchunk(revlog, revs, startidx, endidx=None):
443 443 """returns revs[startidx:endidx] without empty trailing revs
444 444
445 445 Doctest Setup
446 446 >>> revlog = _testrevlog([
447 447 ... 5, #0
448 448 ... 10, #1
449 449 ... 12, #2
450 450 ... 12, #3 (empty)
451 451 ... 17, #4
452 452 ... 21, #5
453 453 ... 21, #6 (empty)
454 454 ... ])
455 455
456 456 Contiguous cases:
457 457 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
458 458 [0, 1, 2, 3, 4, 5]
459 459 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
460 460 [0, 1, 2, 3, 4]
461 461 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
462 462 [0, 1, 2]
463 463 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
464 464 [2]
465 465 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
466 466 [3, 4, 5]
467 467 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
468 468 [3, 4]
469 469
470 470 Discontiguous cases:
471 471 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
472 472 [1, 3, 5]
473 473 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
474 474 [1]
475 475 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
476 476 [3, 5]
477 477 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
478 478 [3, 5]
479 479 """
480 480 length = revlog.length
481 481
482 482 if endidx is None:
483 483 endidx = len(revs)
484 484
485 485 # If we have a non-empty delta candidate, there is nothing to trim
486 486 if revs[endidx - 1] < len(revlog):
487 487 # Trim empty revs at the end, except the very first revision of a chain
488 488 while (
489 489 endidx > 1 and endidx > startidx and length(revs[endidx - 1]) == 0
490 490 ):
491 491 endidx -= 1
492 492
493 493 return revs[startidx:endidx]
494 494
495 495
496 496 def segmentspan(revlog, revs):
497 497 """Get the byte span of a segment of revisions
498 498
499 499 revs is a sorted array of revision numbers
500 500
501 501 >>> revlog = _testrevlog([
502 502 ... 5, #0
503 503 ... 10, #1
504 504 ... 12, #2
505 505 ... 12, #3 (empty)
506 506 ... 17, #4
507 507 ... ])
508 508
509 509 >>> segmentspan(revlog, [0, 1, 2, 3, 4])
510 510 17
511 511 >>> segmentspan(revlog, [0, 4])
512 512 17
513 513 >>> segmentspan(revlog, [3, 4])
514 514 5
515 515 >>> segmentspan(revlog, [1, 2, 3,])
516 516 7
517 517 >>> segmentspan(revlog, [1, 3])
518 518 7
519 519 """
520 520 if not revs:
521 521 return 0
522 522 end = revlog.end(revs[-1])
523 523 return end - revlog.start(revs[0])
524 524
525 525
526 526 def _textfromdelta(revlog, baserev, delta, p1, p2, flags, expectednode):
527 527 """build full text from a (base, delta) pair and other metadata"""
528 528 # special case deltas which replace entire base; no need to decode
529 529 # base revision. this neatly avoids censored bases, which throw when
530 530 # they're decoded.
531 531 hlen = struct.calcsize(b">lll")
532 532 if delta[:hlen] == mdiff.replacediffheader(
533 533 revlog.rawsize(baserev), len(delta) - hlen
534 534 ):
535 535 fulltext = delta[hlen:]
536 536 else:
537 537 # deltabase is the rawtext before it was changed by flag processors,
538 538 # which is equivalent to non-raw text
539 539 basetext = revlog.revision(baserev)
540 540 fulltext = mdiff.patch(basetext, delta)
541 541
542 542 try:
543 543 validatehash = flagutil.processflagsraw(revlog, fulltext, flags)
544 544 if validatehash:
545 545 revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
546 546 if flags & REVIDX_ISCENSORED:
547 547 raise error.StorageError(
548 548 _(b'node %s is not censored') % expectednode
549 549 )
550 550 except error.CensoredNodeError:
551 551 # must pass the censored index flag to add censored revisions
552 552 if not flags & REVIDX_ISCENSORED:
553 553 raise
554 554 return fulltext
555 555
556 556
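The header comparison above leans on the bdiff delta format: each hunk is three big-endian int32s (start, end, length of new data) followed by the new bytes, so a delta replacing the whole base starts with (0, baselen, newlen). A small sketch of that layout, with invented values:

    import struct

    base_len, new_text = 42, b'replacement text'  # hypothetical values
    header = struct.pack(b">lll", 0, base_len, len(new_text))
    delta = header + new_text                     # full-replacement delta
    assert len(header) == struct.calcsize(b">lll")
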
557 557 @attr.s(slots=True, frozen=True)
558 558 class _deltainfo:
559 559 distance = attr.ib()
560 560 deltalen = attr.ib()
561 561 data = attr.ib()
562 562 base = attr.ib()
563 563 chainbase = attr.ib()
564 564 chainlen = attr.ib()
565 565 compresseddeltalen = attr.ib()
566 566 snapshotdepth = attr.ib()
567 567
568 568
569 569 def drop_u_compression(delta):
570 570 """turn into a "u" (no-compression) into no-compression without header
571 571
572 572 This is useful for revlog formats that have a better compression method.
573 573 """
574 574 assert delta.data[0] == b'u', delta.data[0]
575 575 return _deltainfo(
576 576 delta.distance,
577 577 delta.deltalen - 1,
578 578 (b'', delta.data[1]),
579 579 delta.base,
580 580 delta.chainbase,
581 581 delta.chainlen,
582 582 delta.compresseddeltalen,
583 583 delta.snapshotdepth,
584 584 )
585 585
586 586
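Concretely, `delta.data` is a (header, payload) pair and a b'u' header marks an uncompressed chunk; dropping the header shortens the stored length by one byte. A tiny illustration with an invented payload:

    before = (b'u', b'raw delta bytes')  # header + payload, as stored
    after = (b'', before[1])             # what drop_u_compression keeps
    # the stored length shrinks by len(b'u') == 1, hence `delta.deltalen - 1`
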
587 587 def is_good_delta_info(revlog, deltainfo, revinfo):
588 588 """Returns True if the given delta is good. Good means that it is within
589 589 the disk span, disk size, and chain length bounds that we know to be
590 590 performant."""
591 591 if deltainfo is None:
592 592 return False
593 593
594 594 # the DELTA_BASE_REUSE_FORCE case should have been taken care of sooner so
595 595 # we should never end up asking such a question. Adding the assert as a
596 596 # safeguard to detect anything that would be fishy in this regard.
597 597 assert (
598 598 revinfo.cachedelta is None
599 599 or revinfo.cachedelta[2] != DELTA_BASE_REUSE_FORCE
600 600 or not revlog.delta_config.general_delta
601 601 )
602 602
603 603 # - 'deltainfo.distance' is the distance from the base revision --
604 604 # bounding it limits the amount of I/O we need to do.
605 605 # - 'deltainfo.compresseddeltalen' is the sum of the total size of
606 606 # deltas we need to apply -- bounding it limits the amount of CPU
607 607 # we consume.
608 608
609 609 textlen = revinfo.textlen
610 610 defaultmax = textlen * 4
611 611 maxdist = revlog.delta_config.max_deltachain_span
612 612 if not maxdist:
613 613 maxdist = deltainfo.distance # ensure the conditional passes
614 614 maxdist = max(maxdist, defaultmax)
615 615
616 616 # Bad delta from read span:
617 617 #
618 618 # If the span of data read is larger than the maximum allowed.
619 619 #
620 620 # In the sparse-revlog case, we rely on the associated "sparse reading"
621 621 # to avoid issues related to the span of data. In theory, it would be
622 622 # possible to build pathological revlogs where delta patterns would lead
623 623 # to too many reads. However, they do not happen in practice at all. So
624 624 # we skip the span check entirely.
625 625 if not revlog.delta_config.sparse_revlog and maxdist < deltainfo.distance:
626 626 return False
627 627
628 628 # Bad delta from new delta size:
629 629 #
630 630 # If the delta size is larger than the target text, storing the
631 631 # delta will be inefficient.
632 632 if textlen < deltainfo.deltalen:
633 633 return False
634 634
635 635 # Bad delta from cumulated payload size:
636 636 #
637 637 # If the sum of the deltas gets larger than K * target text length.
638 638 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
639 639 return False
640 640
641 641 # Bad delta from chain length:
642 642 #
643 643 # If the number of deltas in the chain gets too high.
644 644 if (
645 645 revlog.delta_config.max_chain_len
646 646 and revlog.delta_config.max_chain_len < deltainfo.chainlen
647 647 ):
648 648 return False
649 649
650 650 # bad delta from intermediate snapshot size limit
651 651 #
652 652 # If an intermediate snapshot size is higher than the limit. The
653 653 # limit exists to prevent endless chains of intermediate deltas from
654 654 # being created.
655 655 if (
656 656 deltainfo.snapshotdepth is not None
657 657 and (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen
658 658 ):
659 659 return False
660 660
661 661 # bad delta if new intermediate snapshot is larger than the previous
662 662 # snapshot
663 663 if (
664 664 deltainfo.snapshotdepth
665 665 and revlog.length(deltainfo.base) < deltainfo.deltalen
666 666 ):
667 667 return False
668 668
669 669 return True
670 670
671 671
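To make the bounds above concrete, a worked check with invented numbers (textlen 1000, intermediate snapshot depth 2):

    textlen = 1000         # size of the revision being stored
    deltalen = 180         # size of the candidate delta
    chain_payload = 1500   # cumulated compressed size of the chain
    depth = 2              # intermediate snapshot depth
    ok = (
        deltalen <= textlen                 # delta no bigger than the text
        and chain_payload <= 2 * textlen    # LIMIT_DELTA2TEXT bound
        and deltalen <= (textlen >> depth)  # snapshot size limit: 250 here
    )  # True: 180 <= 1000, 1500 <= 2000, 180 <= 250
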
672 672 # If a revision's full text is that much bigger than a base candidate full
673 673 # text's, it is very unlikely that it will produce a valid delta. We no longer
674 674 # consider these candidates.
675 675 LIMIT_BASE2TEXT = 500
676 676
677 677
678 678 class _DeltaSearch:
679 679 """perform the search of a good delta for a single revlog revision
680 680
681 681 note: some of the deltacomputer.finddeltainfo logic should probably move
682 682 here.
683 683 """
684 684
685 685 def __init__(
686 686 self,
687 687 revlog,
688 688 textlen,
689 689 p1,
690 690 p2,
691 691 cachedelta,
692 692 excluded_bases=None,
693 693 target_rev=None,
694 694 snapshot_cache=None,
695 695 ):
696 696 self.revlog = revlog
697 697 self.textlen = textlen
698 698 self.p1 = p1
699 699 self.p2 = p2
700 700 self.cachedelta = cachedelta
701 701 self.excluded_bases = excluded_bases
702 702 self.target_rev = target_rev
703 703 self.snapshot_cache = snapshot_cache
704 704
705 705 def candidate_groups(self):
706 706 """Provides group of revision to be tested as delta base
707 707
708 708 This top level function focus on emitting groups with unique and
709 709 worthwhile content. See _raw_candidate_groups for details about the
710 710 group order.
711 711 """
712 712 # should we try to build a delta?
713 713 if not (len(self.revlog) and self.revlog._storedeltachains):
714 714 yield None
715 715 return
716 716
717 717 if self.target_rev is None:
718 718 self.target_rev = len(self.revlog)
719 719
720 720 if not self.revlog.delta_config.general_delta:
721 721 # before general delta, there is only one possible delta base
722 722 yield (self.target_rev - 1,)
723 723 yield None
724 724 return
725 725
726 726 # the DELTA_BASE_REUSE_FORCE case should have been taken care of sooner
727 727 # so we should never end up asking such a question. Adding the assert as
728 728 # a safeguard to detect anything that would be fishy in this regard.
729 729 assert (
730 730 self.cachedelta is None
731 731 or self.cachedelta[2] != DELTA_BASE_REUSE_FORCE
732 732 or not self.revlog.delta_config.general_delta
733 733 )
734 734
735 735 deltalength = self.revlog.length
736 736 deltaparent = self.revlog.deltaparent
737 737 sparse = self.revlog.delta_config.sparse_revlog
738 738 good = None
739 739
740 740 deltas_limit = self.textlen * LIMIT_DELTA2TEXT
741 741 group_chunk_size = self.revlog.delta_config.candidate_group_chunk_size
742 742
743 743 tested = {nullrev}
744 candidates = _refinedgroups(
745 self.revlog,
746 self.p1,
747 self.p2,
748 self.cachedelta,
749 snapshot_cache=self.snapshot_cache,
750 )
744 candidates = self._refined_groups()
751 745 while True:
752 746 temptative = candidates.send(good)
753 747 if temptative is None:
754 748 break
755 749 group = []
756 750 for rev in temptative:
757 751 # skip over empty deltas (no need to include them in a chain)
758 752 while not (rev == nullrev or rev in tested or deltalength(rev)):
759 753 tested.add(rev)
760 754 rev = deltaparent(rev)
761 755 # no need to try a delta against nullrev, this will be done as
762 756 # a last resort.
763 757 if rev == nullrev:
764 758 continue
765 759 # filter out revisions we tested already
766 760 if rev in tested:
767 761 continue
768 762
769 763 # a higher authority deemed the base unworthy (e.g. censored)
770 764 if (
771 765 self.excluded_bases is not None
772 766 and rev in self.excluded_bases
773 767 ):
774 768 tested.add(rev)
775 769 continue
776 770 # We are in some recomputation cases and that rev is too high
777 771 # in the revlog
778 772 if self.target_rev is not None and rev >= self.target_rev:
779 773 tested.add(rev)
780 774 continue
781 775 # filter out delta bases that will never produce a good delta
782 776 if deltas_limit < self.revlog.length(rev):
783 777 tested.add(rev)
784 778 continue
785 779 if sparse and self.revlog.rawsize(rev) < (
786 780 self.textlen // LIMIT_BASE2TEXT
787 781 ):
788 782 tested.add(rev)
789 783 continue
790 784 # no delta for rawtext-changing revs (see "candelta" for why)
791 785 if self.revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
792 786 tested.add(rev)
793 787 continue
794 788
795 789 # If we reach here, we are about to build and test a delta.
796 790 # The delta building process will compute the chaininfo in all
797 791 # cases; since that computation is cached, it is fine to access
798 792 # it here too.
799 793 chainlen, chainsize = self.revlog._chaininfo(rev)
800 794 # if chain will be too long, skip base
801 795 if (
802 796 self.revlog.delta_config.max_chain_len
803 797 and chainlen >= self.revlog.delta_config.max_chain_len
804 798 ):
805 799 tested.add(rev)
806 800 continue
807 801 # if the chain already has too much data, skip base
808 802 if deltas_limit < chainsize:
809 803 tested.add(rev)
810 804 continue
811 805 if (
812 806 sparse
813 807 and self.revlog.delta_config.upper_bound_comp is not None
814 808 ):
815 809 maxcomp = self.revlog.delta_config.upper_bound_comp
816 810 basenotsnap = (self.p1, self.p2, nullrev)
817 811 if rev not in basenotsnap and self.revlog.issnapshot(rev):
818 812 snapshotdepth = self.revlog.snapshotdepth(rev)
819 813 # If text is significantly larger than the base, we can
820 814 # expect the resulting delta to be proportional to the size
821 815 # difference
822 816 revsize = self.revlog.rawsize(rev)
823 817 rawsizedistance = max(self.textlen - revsize, 0)
824 818 # use an estimate of the compression upper bound.
825 819 lowestrealisticdeltalen = rawsizedistance // maxcomp
826 820
827 821 # check the absolute constraint on the delta size
828 822 snapshotlimit = self.textlen >> snapshotdepth
829 823 if snapshotlimit < lowestrealisticdeltalen:
830 824 # delta lower bound is larger than accepted upper
831 825 # bound
832 826 tested.add(rev)
833 827 continue
834 828
835 829 # check the relative constraint on the delta size
836 830 revlength = self.revlog.length(rev)
837 831 if revlength < lowestrealisticdeltalen:
838 832 # delta probable lower bound is larger than target
839 833 # base
840 834 tested.add(rev)
841 835 continue
842 836
843 837 group.append(rev)
844 838 if group:
845 839 # When the size of the candidate group is big, it can result in
846 840 # quite a significant performance impact. To reduce this, we
847 841 # can send them in smaller batches until the new batch does not
848 842 # provide any improvements.
849 843 #
850 844 # This might reduce the overall efficiency of the compression
851 845 # in some corner cases, but that should also prevent very
852 846 # pathological cases from being an issue. (e.g. 20,000
853 847 # candidates).
854 848 #
855 849 # XXX note that the ordering of the group becomes important as
856 850 # it now impacts the final result. The current order is
857 851 # unprocessed and can be improved.
858 852 if group_chunk_size == 0:
859 853 tested.update(group)
860 854 good = yield tuple(group)
861 855 else:
862 856 prev_good = good
863 857 for start in range(0, len(group), group_chunk_size):
864 858 sub_group = group[start : start + group_chunk_size]
865 859 tested.update(sub_group)
866 860 good = yield tuple(sub_group)
867 861 if prev_good == good:
868 862 break
869 863
870 864 yield None
871 865
866 def _refined_groups(self):
867 good = None
868 # First we try to reuse the delta contained in the bundle (or from
869 # the source revlog).
870 #
871 # This logic only applies to general delta repositories and can be
872 # disabled through configuration. Disabling source delta reuse is
873 # useful when we want to make sure we recomputed "optimal" deltas.
874 debug_info = None
875 if (
876 self.cachedelta is not None
877 and self.cachedelta[2] > DELTA_BASE_REUSE_NO
878 ):
879 # Assume what we received from the server is a good choice
880 # the delta building step will reuse the cache
881 if debug_info is not None:
882 debug_info['cached-delta.tested'] += 1
883 good = yield (self.cachedelta[0],)
884 if good is not None:
885 if debug_info is not None:
886 debug_info['cached-delta.accepted'] += 1
887 yield None
888 return
889 if self.snapshot_cache is None:
890 self.snapshot_cache = SnapshotCache()
891 groups = _rawgroups(
892 self.revlog,
893 self.p1,
894 self.p2,
895 self.cachedelta,
896 self.snapshot_cache,
897 )
898 for candidates in groups:
899 good = yield candidates
900 if good is not None:
901 break
872 902
873 def _refinedgroups(revlog, p1, p2, cachedelta, snapshot_cache=None):
874 good = None
875 # First we try to reuse the delta contained in the bundle.
876 # (or from the source revlog)
877 #
878 # This logic only applies to general delta repositories and can be disabled
879 # through configuration. Disabling source delta reuse is useful when
880 # we want to make sure we recomputed "optimal" deltas.
881 debug_info = None
882 if cachedelta is not None and cachedelta[2] > DELTA_BASE_REUSE_NO:
883 # Assume what we received from the server is a good choice
884 # the delta building step will reuse the cache
885 if debug_info is not None:
886 debug_info['cached-delta.tested'] += 1
887 good = yield (cachedelta[0],)
888 if good is not None:
889 if debug_info is not None:
890 debug_info['cached-delta.accepted'] += 1
903 # If sparse revlog is enabled, we can try to refine the available
904 # deltas
905 if not self.revlog.delta_config.sparse_revlog:
891 906 yield None
892 907 return
893 if snapshot_cache is None:
894 snapshot_cache = SnapshotCache()
895 groups = _rawgroups(
896 revlog,
897 p1,
898 p2,
899 cachedelta,
900 snapshot_cache,
901 )
902 for candidates in groups:
903 good = yield candidates
904 if good is not None:
905 break
906
907 # If sparse revlog is enabled, we can try to refine the available deltas
908 if not revlog.delta_config.sparse_revlog:
909 yield None
910 return
911 908
912 # if we have a refinable value, try to refine it
913 if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
914 # refine snapshot down
915 previous = None
916 while previous != good:
917 previous = good
918 base = revlog.deltaparent(good)
919 if base == nullrev:
920 break
921 good = yield (base,)
922 # refine snapshot up
923 if not snapshot_cache.snapshots:
924 snapshot_cache.update(revlog, good + 1)
925 previous = None
926 while good != previous:
927 previous = good
928 children = tuple(sorted(c for c in snapshot_cache.snapshots[good]))
929 good = yield children
909 # if we have a refinable value, try to refine it
910 if (
911 good is not None
912 and good not in (self.p1, self.p2)
913 and self.revlog.issnapshot(good)
914 ):
915 # refine snapshot down
916 previous = None
917 while previous != good:
918 previous = good
919 base = self.revlog.deltaparent(good)
920 if base == nullrev:
921 break
922 good = yield (base,)
923 # refine snapshot up
924 if not self.snapshot_cache.snapshots:
925 self.snapshot_cache.update(self.revlog, good + 1)
926 previous = None
927 while good != previous:
928 previous = good
929 children = tuple(
930 sorted(c for c in self.snapshot_cache.snapshots[good])
931 )
932 good = yield children
930 933
931 if debug_info is not None:
932 if good is None:
933 debug_info['no-solution'] += 1
934 if debug_info is not None:
935 if good is None:
936 debug_info['no-solution'] += 1
934 937
935 yield None
938 yield None
936 939
937 940
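Both the removed function and the new method speak the same coroutine-style protocol: the caller primes the generator with next(), then send()s back the accepted base (or None) to steer refinement until the None sentinel. A toy model of that protocol, with invented revision numbers:

    def _toy_search():
        good = yield (10, 7)   # first candidate group
        if good is None:
            good = yield (3,)  # fall back to a cheaper base
        yield None             # sentinel: nothing left to try

    g = _toy_search()
    group = next(g)            # -> (10, 7)
    group = g.send(7)          # caller accepted rev 7 -> generator yields None
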
938 941 def _rawgroups(revlog, p1, p2, cachedelta, snapshot_cache=None):
939 942 """Provides group of revision to be tested as delta base
940 943
941 944 This lower level function focus on emitting delta theorically interresting
942 945 without looking it any practical details.
943 946
944 947 The group order aims at providing fast or small candidates first.
945 948 """
946 949 # Why search for a delta base if we cannot use a delta base?
947 950 assert revlog.delta_config.general_delta
948 951 # also see issue6056
949 952 sparse = revlog.delta_config.sparse_revlog
950 953 curr = len(revlog)
951 954 prev = curr - 1
952 955 deltachain = lambda rev: revlog._deltachain(rev)[0]
953 956
954 957 # exclude already lazy tested base if any
955 958 parents = [p for p in (p1, p2) if p != nullrev]
956 959
957 960 if not revlog.delta_config.delta_both_parents and len(parents) == 2:
958 961 parents.sort()
959 962 # To minimize the chance of having to build a fulltext,
960 963 # pick first whichever parent is closest to us (max rev)
961 964 yield (parents[1],)
962 965 # then the other one (min rev) if the first did not fit
963 966 yield (parents[0],)
964 967 elif len(parents) > 0:
965 968 # Test all parents (1 or 2), and keep the best candidate
966 969 yield parents
967 970
968 971 if sparse and parents:
969 972 if snapshot_cache is None:
970 973 # map: base-rev: [snapshot-revs]
971 974 snapshot_cache = SnapshotCache()
972 975 # See if an existing snapshot in the parent chains can be used as
973 976 # a base for a new intermediate snapshot
974 977 #
975 978 # search for snapshots in the parents' delta chains
976 979 # map: snapshot-level: snapshot-rev
977 980 parents_snaps = collections.defaultdict(set)
978 981 candidate_chains = [deltachain(p) for p in parents]
979 982 for chain in candidate_chains:
980 983 for idx, s in enumerate(chain):
981 984 if not revlog.issnapshot(s):
982 985 break
983 986 parents_snaps[idx].add(s)
984 987 snapfloor = min(parents_snaps[0]) + 1
985 988 snapshot_cache.update(revlog, snapfloor)
986 989 # search for the highest "unrelated" revision
987 990 #
988 991 # Adding snapshots used by "unrelated" revisions increases the odds we
989 992 # reuse an independent, yet better, snapshot chain.
990 993 #
991 994 # XXX instead of building a set of revisions, we could lazily enumerate
992 995 # over the chains. That would be more efficient, however we stick to
993 996 # simple code for now.
994 997 all_revs = set()
995 998 for chain in candidate_chains:
996 999 all_revs.update(chain)
997 1000 other = None
998 1001 for r in revlog.revs(prev, snapfloor):
999 1002 if r not in all_revs:
1000 1003 other = r
1001 1004 break
1002 1005 if other is not None:
1003 1006 # To avoid unfair competition, we won't use unrelated intermediate
1004 1007 # snapshots that are deeper than the ones from the parent delta
1005 1008 # chain.
1006 1009 max_depth = max(parents_snaps.keys())
1007 1010 chain = deltachain(other)
1008 1011 for depth, s in enumerate(chain):
1009 1012 if s < snapfloor:
1010 1013 continue
1011 1014 if max_depth < depth:
1012 1015 break
1013 1016 if not revlog.issnapshot(s):
1014 1017 break
1015 1018 parents_snaps[depth].add(s)
1016 1019 # Test them as possible intermediate snapshot bases.
1017 1020 # We test them from highest to lowest level. High level ones are more
1018 1021 # likely to result in a small delta.
1019 1022 floor = None
1020 1023 for idx, snaps in sorted(parents_snaps.items(), reverse=True):
1021 1024 siblings = set()
1022 1025 for s in snaps:
1023 1026 siblings.update(snapshot_cache.snapshots[s])
1024 1027 # Before considering making a new intermediate snapshot, we check
1025 1028 # if an existing snapshot, a child of the base we consider, would
1026 1029 # be suitable.
1027 1030 #
1028 1031 # It gives a chance to reuse a delta chain "unrelated" to the
1029 1032 # current revision instead of starting our own. Without such
1030 1033 # re-use, topological branches would keep reopening new chains,
1031 1034 # creating more and more snapshots as the repository grows.
1032 1035
1033 1036 if floor is not None:
1034 1037 # We only do this for siblings created after the one in our
1035 1038 # parent's delta chain. Those created before have less chance
1036 1039 # to be a valid base since our ancestors had to create a new
1037 1040 # snapshot.
1038 1041 siblings = [r for r in siblings if floor < r]
1039 1042 yield tuple(sorted(siblings))
1040 1043 # then test the base from our parent's delta chain.
1041 1044 yield tuple(sorted(snaps))
1042 1045 floor = min(snaps)
1043 1046 # No suitable base found in the parent chain; search whether any full
1044 1047 # snapshots emitted since the parent's base would be a suitable base for
1045 1048 # an intermediate snapshot.
1046 1049 #
1047 1050 # It gives a chance to reuse a delta chain unrelated to the current
1048 1051 # revisions instead of starting our own. Without such re-use,
1049 1052 # topological branches would keep reopening new full chains, creating
1050 1053 # more and more snapshots as the repository grows.
1051 1054 full = [r for r in snapshot_cache.snapshots[nullrev] if snapfloor <= r]
1052 1055 yield tuple(sorted(full))
1053 1056
1054 1057 if not sparse:
1055 1058 # other approaches failed; try against prev to hopefully save us a
1056 1059 # fulltext.
1057 1060 yield (prev,)
1058 1061
1059 1062
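In rough emission order (sparse case), the function above therefore tries: the parents, then per snapshot level from highest to lowest the sibling snapshots followed by the parents' own snapshots, then recent full snapshots; non-sparse revlogs simply fall back to `prev`. A runnable toy restating that order, with invented revision numbers:

    parents = (12, 9)
    sibling_snapshots = {1: (11,), 0: (8,)}  # children of candidate bases
    parent_snapshots = {1: (10,), 0: (5,)}   # bases from the parents' chains
    full_snapshots = (6,)

    def toy_rawgroups():
        yield parents
        for level in sorted(parent_snapshots, reverse=True):
            yield sibling_snapshots[level]
            yield parent_snapshots[level]
        yield full_snapshots

    print(list(toy_rawgroups()))  # [(12, 9), (11,), (10,), (8,), (5,), (6,)]
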
1060 1063 class SnapshotCache:
1061 1064 __slots__ = ('snapshots', '_start_rev', '_end_rev')
1062 1065
1063 1066 def __init__(self):
1064 1067 self.snapshots = collections.defaultdict(set)
1065 1068 self._start_rev = None
1066 1069 self._end_rev = None
1067 1070
1068 1071 def update(self, revlog, start_rev=0):
1069 1072 """find snapshots from start_rev to tip"""
1070 1073 nb_revs = len(revlog)
1071 1074 end_rev = nb_revs - 1
1072 1075 if start_rev > end_rev:
1073 1076 return # range is empty
1074 1077
1075 1078 if self._start_rev is None:
1076 1079 assert self._end_rev is None
1077 1080 self._update(revlog, start_rev, end_rev)
1078 1081 elif not (self._start_rev <= start_rev and end_rev <= self._end_rev):
1079 1082 if start_rev < self._start_rev:
1080 1083 self._update(revlog, start_rev, self._start_rev - 1)
1081 1084 if self._end_rev < end_rev:
1082 1085 self._update(revlog, self._end_rev + 1, end_rev)
1083 1086
1084 1087 if self._start_rev is None:
1085 1088 assert self._end_rev is None
1086 1089 self._end_rev = end_rev
1087 1090 self._start_rev = start_rev
1088 1091 else:
1089 1092 self._start_rev = min(self._start_rev, start_rev)
1090 1093 self._end_rev = max(self._end_rev, end_rev)
1091 1094 assert self._start_rev <= self._end_rev, (
1092 1095 self._start_rev,
1093 1096 self._end_rev,
1094 1097 )
1095 1098
1096 1099 def _update(self, revlog, start_rev, end_rev):
1097 1100 """internal method that actually do update content"""
1098 1101 assert self._start_rev is None or (
1099 1102 start_rev < self._start_rev or start_rev > self._end_rev
1100 1103 ), (self._start_rev, self._end_rev, start_rev, end_rev)
1101 1104 assert self._start_rev is None or (
1102 1105 end_rev < self._start_rev or end_rev > self._end_rev
1103 1106 ), (self._start_rev, self._end_rev, start_rev, end_rev)
1104 1107 cache = self.snapshots
1105 1108 if hasattr(revlog.index, 'findsnapshots'):
1106 1109 revlog.index.findsnapshots(cache, start_rev, end_rev)
1107 1110 else:
1108 1111 deltaparent = revlog.deltaparent
1109 1112 issnapshot = revlog.issnapshot
1110 1113 for rev in revlog.revs(start_rev, end_rev):
1111 1114 if issnapshot(rev):
1112 1115 cache[deltaparent(rev)].add(rev)
1113 1116
1114 1117
1115 1118 class deltacomputer:
1116 1119 """object capable of computing delta and finding delta for multiple revision
1117 1120
1118 1121 This object is meant to compute and find multiple delta applied to the same
1119 1122 revlog.
1120 1123 """
1121 1124
1122 1125 def __init__(
1123 1126 self,
1124 1127 revlog,
1125 1128 write_debug=None,
1126 1129 debug_search=False,
1127 1130 debug_info=None,
1128 1131 ):
1129 1132 self.revlog = revlog
1130 1133 self._write_debug = write_debug
1131 1134 if write_debug is None:
1132 1135 self._debug_search = False
1133 1136 else:
1134 1137 self._debug_search = debug_search
1135 1138 self._debug_info = debug_info
1136 1139 self._snapshot_cache = SnapshotCache()
1137 1140
1138 1141 @property
1139 1142 def _gather_debug(self):
1140 1143 return self._write_debug is not None or self._debug_info is not None
1141 1144
1142 1145 def buildtext(self, revinfo):
1143 1146 """Builds a fulltext version of a revision
1144 1147
1145 1148 revinfo: revisioninfo instance that contains all needed info
1146 1149 """
1147 1150 btext = revinfo.btext
1148 1151 if btext[0] is not None:
1149 1152 return btext[0]
1150 1153
1151 1154 revlog = self.revlog
1152 1155 cachedelta = revinfo.cachedelta
1153 1156 baserev = cachedelta[0]
1154 1157 delta = cachedelta[1]
1155 1158
1156 1159 fulltext = btext[0] = _textfromdelta(
1157 1160 revlog,
1158 1161 baserev,
1159 1162 delta,
1160 1163 revinfo.p1,
1161 1164 revinfo.p2,
1162 1165 revinfo.flags,
1163 1166 revinfo.node,
1164 1167 )
1165 1168 return fulltext
1166 1169
1167 1170 def _builddeltadiff(self, base, revinfo):
1168 1171 revlog = self.revlog
1169 1172 t = self.buildtext(revinfo)
1170 1173 if revlog.iscensored(base):
1171 1174 # deltas based on a censored revision must replace the
1172 1175 # full content in one patch, so delta works everywhere
1173 1176 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
1174 1177 delta = header + t
1175 1178 else:
1176 1179 ptext = revlog.rawdata(base)
1177 1180 delta = mdiff.textdiff(ptext, t)
1178 1181
1179 1182 return delta
1180 1183
1181 1184 def _builddeltainfo(self, revinfo, base, target_rev=None):
1182 1185 # can we use the cached delta?
1183 1186 revlog = self.revlog
1184 1187 chainbase = revlog.chainbase(base)
1185 1188 if revlog.delta_config.general_delta:
1186 1189 deltabase = base
1187 1190 else:
1188 1191 if target_rev is not None and base != target_rev - 1:
1189 1192 msg = (
1190 1193 b'general delta cannot use delta for something else '
1191 1194 b'than `prev`: %d<-%d'
1192 1195 )
1193 1196 msg %= (base, target_rev)
1194 1197 raise error.ProgrammingError(msg)
1195 1198 deltabase = chainbase
1196 1199 snapshotdepth = None
1197 1200 if revlog.delta_config.sparse_revlog and deltabase == nullrev:
1198 1201 snapshotdepth = 0
1199 1202 elif revlog.delta_config.sparse_revlog and revlog.issnapshot(deltabase):
1200 1203 # A delta chain should always be one full snapshot,
1201 1204 # zero or more semi-snapshots, and zero or more deltas
1202 1205 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
1203 1206 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
1204 1207 snapshotdepth = len(revlog._deltachain(deltabase)[0])
1205 1208 delta = None
1206 1209 if revinfo.cachedelta:
1207 1210 cachebase = revinfo.cachedelta[0]
1208 1211 # check if the diff still applies
1209 1212 currentbase = cachebase
1210 1213 while (
1211 1214 currentbase != nullrev
1212 1215 and currentbase != base
1213 1216 and self.revlog.length(currentbase) == 0
1214 1217 ):
1215 1218 currentbase = self.revlog.deltaparent(currentbase)
1216 1219 if self.revlog.delta_config.lazy_delta and currentbase == base:
1217 1220 delta = revinfo.cachedelta[1]
1218 1221 if delta is None:
1219 1222 delta = self._builddeltadiff(base, revinfo)
1220 1223 if self._debug_search:
1221 1224 msg = b"DBG-DELTAS-SEARCH: uncompressed-delta-size=%d\n"
1222 1225 msg %= len(delta)
1223 1226 self._write_debug(msg)
1224 1227 # snapshotdepth needs to be neither None nor a 0-level snapshot
1225 1228 if revlog.delta_config.upper_bound_comp is not None and snapshotdepth:
1226 1229 lowestrealisticdeltalen = (
1227 1230 len(delta) // revlog.delta_config.upper_bound_comp
1228 1231 )
1229 1232 snapshotlimit = revinfo.textlen >> snapshotdepth
1230 1233 if self._debug_search:
1231 1234 msg = b"DBG-DELTAS-SEARCH: projected-lower-size=%d\n"
1232 1235 msg %= lowestrealisticdeltalen
1233 1236 self._write_debug(msg)
1234 1237 if snapshotlimit < lowestrealisticdeltalen:
1235 1238 if self._debug_search:
1236 1239 msg = b"DBG-DELTAS-SEARCH: DISCARDED (snapshot limit)\n"
1237 1240 self._write_debug(msg)
1238 1241 return None
1239 1242 if revlog.length(base) < lowestrealisticdeltalen:
1240 1243 if self._debug_search:
1241 1244 msg = b"DBG-DELTAS-SEARCH: DISCARDED (prev size)\n"
1242 1245 self._write_debug(msg)
1243 1246 return None
1244 1247 header, data = revlog._inner.compress(delta)
1245 1248 deltalen = len(header) + len(data)
1246 1249 offset = revlog.end(len(revlog) - 1)
1247 1250 dist = deltalen + offset - revlog.start(chainbase)
1248 1251 chainlen, compresseddeltalen = revlog._chaininfo(base)
1249 1252 chainlen += 1
1250 1253 compresseddeltalen += deltalen
1251 1254
1252 1255 return _deltainfo(
1253 1256 dist,
1254 1257 deltalen,
1255 1258 (header, data),
1256 1259 deltabase,
1257 1260 chainbase,
1258 1261 chainlen,
1259 1262 compresseddeltalen,
1260 1263 snapshotdepth,
1261 1264 )
1262 1265
1263 1266 def _fullsnapshotinfo(self, revinfo, curr):
1264 1267 rawtext = self.buildtext(revinfo)
1265 1268 data = self.revlog._inner.compress(rawtext)
1266 1269 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
1267 1270 deltabase = chainbase = curr
1268 1271 snapshotdepth = 0
1269 1272 chainlen = 1
1270 1273
1271 1274 return _deltainfo(
1272 1275 dist,
1273 1276 deltalen,
1274 1277 data,
1275 1278 deltabase,
1276 1279 chainbase,
1277 1280 chainlen,
1278 1281 compresseddeltalen,
1279 1282 snapshotdepth,
1280 1283 )
1281 1284
1282 1285 def finddeltainfo(self, revinfo, excluded_bases=None, target_rev=None):
1283 1286 """Find an acceptable delta against a candidate revision
1284 1287
1285 1288 revinfo: information about the revision (instance of _revisioninfo)
1286 1289
1287 1290 Returns the first acceptable candidate revision, as ordered by
1288 1291 _DeltaSearch.candidate_groups
1289 1292
1290 1293 If no suitable deltabase is found, we return delta info for a full
1291 1294 snapshot.
1292 1295
1293 1296 `excluded_bases` is an optional set of revisions that cannot be used as
1294 1297 a delta base. Use this to recompute deltas suitable in a censor or strip
1295 1298 context.
1296 1299 """
1297 1300 if target_rev is None:
1298 1301 target_rev = len(self.revlog)
1299 1302
1300 1303 gather_debug = self._gather_debug
1301 1304 cachedelta = revinfo.cachedelta
1302 1305 revlog = self.revlog
1303 1306 p1r = p2r = None
1304 1307
1305 1308 if excluded_bases is None:
1306 1309 excluded_bases = set()
1307 1310
1308 1311 if gather_debug:
1309 1312 start = util.timer()
1310 1313 dbg = self._one_dbg_data()
1311 1314 dbg['revision'] = target_rev
1312 1315 target_revlog = b"UNKNOWN"
1313 1316 target_type = self.revlog.target[0]
1314 1317 target_key = self.revlog.target[1]
1315 1318 if target_type == KIND_CHANGELOG:
1316 1319 target_revlog = b'CHANGELOG:'
1317 1320 elif target_type == KIND_MANIFESTLOG:
1318 1321 target_revlog = b'MANIFESTLOG:'
1319 1322 if target_key:
1320 1323 target_revlog += b'%s:' % target_key
1321 1324 elif target_type == KIND_FILELOG:
1322 1325 target_revlog = b'FILELOG:'
1323 1326 if target_key:
1324 1327 target_revlog += b'%s:' % target_key
1325 1328 dbg['target-revlog'] = target_revlog
1326 1329 p1r = revlog.rev(revinfo.p1)
1327 1330 p2r = revlog.rev(revinfo.p2)
1328 1331 if p1r != nullrev:
1329 1332 p1_chain_len = revlog._chaininfo(p1r)[0]
1330 1333 else:
1331 1334 p1_chain_len = -1
1332 1335 if p2r != nullrev:
1333 1336 p2_chain_len = revlog._chaininfo(p2r)[0]
1334 1337 else:
1335 1338 p2_chain_len = -1
1336 1339 dbg['p1-chain-len'] = p1_chain_len
1337 1340 dbg['p2-chain-len'] = p2_chain_len
1338 1341
1339 1342 # 1) if the revision is empty, no amount of delta can beat it
1340 1343 #
1341 1344 # 2) no delta for flag processor revision (see "candelta" for why)
1342 1345 # not calling candelta since only one revision needs testing, also to
1343 1346 # avoid overhead fetching flags again.
1344 1347 if not revinfo.textlen or revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
1345 1348 deltainfo = self._fullsnapshotinfo(revinfo, target_rev)
1346 1349 if gather_debug:
1347 1350 end = util.timer()
1348 1351 dbg['duration'] = end - start
1349 1352 dbg[
1350 1353 'delta-base'
1351 1354 ] = deltainfo.base # pytype: disable=attribute-error
1352 1355 dbg['search_round_count'] = 0
1353 1356 dbg['using-cached-base'] = False
1354 1357 dbg['delta_try_count'] = 0
1355 1358 dbg['type'] = b"full"
1356 1359 dbg['snapshot-depth'] = 0
1357 1360 self._dbg_process_data(dbg)
1358 1361 return deltainfo
1359 1362
1360 1363 deltainfo = None
1361 1364
1362 1365 # If this source delta is to be forcibly reused, let us comply early.
1363 1366 if (
1364 1367 revlog.delta_config.general_delta
1365 1368 and revinfo.cachedelta is not None
1366 1369 and revinfo.cachedelta[2] == DELTA_BASE_REUSE_FORCE
1367 1370 ):
1368 1371 base = revinfo.cachedelta[0]
1369 1372 if base == nullrev:
1370 1373 dbg_type = b"full"
1371 1374 deltainfo = self._fullsnapshotinfo(revinfo, target_rev)
1372 1375 if gather_debug:
1373 1376 snapshotdepth = 0
1374 1377 elif base not in excluded_bases:
1375 1378 delta = revinfo.cachedelta[1]
1376 1379 header, data = revlog.compress(delta)
1377 1380 deltalen = len(header) + len(data)
1378 1381 if gather_debug:
1379 1382 offset = revlog.end(len(revlog) - 1)
1380 1383 chainbase = revlog.chainbase(base)
1381 1384 distance = deltalen + offset - revlog.start(chainbase)
1382 1385 chainlen, compresseddeltalen = revlog._chaininfo(base)
1383 1386 chainlen += 1
1384 1387 compresseddeltalen += deltalen
1385 1388 if base == p1r or base == p2r:
1386 1389 dbg_type = b"delta"
1387 1390 snapshotdepth = None
1388 1391 elif not revlog.issnapshot(base):
1389 1392 snapshotdepth = None
1390 1393 else:
1391 1394 dbg_type = b"snapshot"
1392 1395 snapshotdepth = revlog.snapshotdepth(base) + 1
1393 1396 else:
1394 1397 distance = None
1395 1398 chainbase = None
1396 1399 chainlen = None
1397 1400 compresseddeltalen = None
1398 1401 snapshotdepth = None
1399 1402 deltainfo = _deltainfo(
1400 1403 distance=distance,
1401 1404 deltalen=deltalen,
1402 1405 data=(header, data),
1403 1406 base=base,
1404 1407 chainbase=chainbase,
1405 1408 chainlen=chainlen,
1406 1409 compresseddeltalen=compresseddeltalen,
1407 1410 snapshotdepth=snapshotdepth,
1408 1411 )
1409 1412
1410 1413 if deltainfo is not None:
1411 1414 if gather_debug:
1412 1415 end = util.timer()
1413 1416 dbg['duration'] = end - start
1414 1417 dbg[
1415 1418 'delta-base'
1416 1419 ] = deltainfo.base # pytype: disable=attribute-error
1417 1420 dbg['search_round_count'] = 0
1418 1421 dbg['using-cached-base'] = True
1419 1422 dbg['delta_try_count'] = 0
1420 1423 dbg['type'] = b"full"
1421 1424 if snapshotdepth is None:
1422 1425 dbg['snapshot-depth'] = 0
1423 1426 else:
1424 1427 dbg['snapshot-depth'] = snapshotdepth
1425 1428 self._dbg_process_data(dbg)
1426 1429 return deltainfo
1427 1430
1428 1431 # count the number of different deltas we tried (for debug purposes)
1429 1432 dbg_try_count = 0
1430 1433 # count the number of "search rounds" we did (for debug purposes)
1431 1434 dbg_try_rounds = 0
1432 1435 dbg_type = b'unknown'
1433 1436
1434 1437 if p1r is None:
1435 1438 p1r = revlog.rev(revinfo.p1)
1436 1439 p2r = revlog.rev(revinfo.p2)
1437 1440
1438 1441 if self._debug_search:
1439 1442 msg = b"DBG-DELTAS-SEARCH: SEARCH rev=%d\n"
1440 1443 msg %= target_rev
1441 1444 self._write_debug(msg)
1442 1445
1443 1446 search = _DeltaSearch(
1444 1447 self.revlog,
1445 1448 revinfo.textlen,
1446 1449 p1r,
1447 1450 p2r,
1448 1451 cachedelta,
1449 1452 excluded_bases,
1450 1453 target_rev,
1451 1454 snapshot_cache=self._snapshot_cache,
1452 1455 )
1453 1456
1454 1457 groups = search.candidate_groups()
1455 1458 candidaterevs = next(groups)
1456 1459 while candidaterevs is not None:
1457 1460 dbg_try_rounds += 1
1458 1461 if self._debug_search:
1459 1462 prev = None
1460 1463 if deltainfo is not None:
1461 1464 prev = deltainfo.base
1462 1465
1463 1466 if (
1464 1467 cachedelta is not None
1465 1468 and len(candidaterevs) == 1
1466 1469 and cachedelta[0] in candidaterevs
1467 1470 ):
1468 1471 round_type = b"cached-delta"
1469 1472 elif p1r in candidaterevs or p2r in candidaterevs:
1470 1473 round_type = b"parents"
1471 1474 elif prev is not None and all(c < prev for c in candidaterevs):
1472 1475 round_type = b"refine-down"
1473 1476 elif prev is not None and all(c > prev for c in candidaterevs):
1474 1477 round_type = b"refine-up"
1475 1478 else:
1476 1479 round_type = b"search-down"
1477 1480 msg = b"DBG-DELTAS-SEARCH: ROUND #%d - %d candidates - %s\n"
1478 1481 msg %= (dbg_try_rounds, len(candidaterevs), round_type)
1479 1482 self._write_debug(msg)
1480 1483 nominateddeltas = []
1481 1484 if deltainfo is not None:
1482 1485 if self._debug_search:
1483 1486 msg = (
1484 1487 b"DBG-DELTAS-SEARCH: CONTENDER: rev=%d - length=%d\n"
1485 1488 )
1486 1489 msg %= (deltainfo.base, deltainfo.deltalen)
1487 1490 self._write_debug(msg)
1488 1491 # if we already found a good delta,
1489 1492 # challenge it against refined candidates
1490 1493 nominateddeltas.append(deltainfo)
1491 1494 for candidaterev in candidaterevs:
1492 1495 if self._debug_search:
1493 1496 msg = b"DBG-DELTAS-SEARCH: CANDIDATE: rev=%d\n"
1494 1497 msg %= candidaterev
1495 1498 self._write_debug(msg)
1496 1499 candidate_type = None
1497 1500 if candidaterev == p1r:
1498 1501 candidate_type = b"p1"
1499 1502 elif candidaterev == p2r:
1500 1503 candidate_type = b"p2"
1501 1504 elif self.revlog.issnapshot(candidaterev):
1502 1505 candidate_type = b"snapshot-%d"
1503 1506 candidate_type %= self.revlog.snapshotdepth(
1504 1507 candidaterev
1505 1508 )
1506 1509
1507 1510 if candidate_type is not None:
1508 1511 msg = b"DBG-DELTAS-SEARCH: type=%s\n"
1509 1512 msg %= candidate_type
1510 1513 self._write_debug(msg)
1511 1514 msg = b"DBG-DELTAS-SEARCH: size=%d\n"
1512 1515 msg %= self.revlog.length(candidaterev)
1513 1516 self._write_debug(msg)
1514 1517 msg = b"DBG-DELTAS-SEARCH: base=%d\n"
1515 1518 msg %= self.revlog.deltaparent(candidaterev)
1516 1519 self._write_debug(msg)
1517 1520
1518 1521 dbg_try_count += 1
1519 1522
1520 1523 if self._debug_search:
1521 1524 delta_start = util.timer()
1522 1525 candidatedelta = self._builddeltainfo(
1523 1526 revinfo,
1524 1527 candidaterev,
1525 1528 target_rev=target_rev,
1526 1529 )
1527 1530 if self._debug_search:
1528 1531 delta_end = util.timer()
1529 1532 msg = b"DBG-DELTAS-SEARCH: delta-search-time=%f\n"
1530 1533 msg %= delta_end - delta_start
1531 1534 self._write_debug(msg)
1532 1535 if candidatedelta is not None:
1533 1536 if is_good_delta_info(self.revlog, candidatedelta, revinfo):
1534 1537 if self._debug_search:
1535 1538 msg = b"DBG-DELTAS-SEARCH: DELTA: length=%d (GOOD)\n"
1536 1539 msg %= candidatedelta.deltalen
1537 1540 self._write_debug(msg)
1538 1541 nominateddeltas.append(candidatedelta)
1539 1542 elif self._debug_search:
1540 1543 msg = b"DBG-DELTAS-SEARCH: DELTA: length=%d (BAD)\n"
1541 1544 msg %= candidatedelta.deltalen
1542 1545 self._write_debug(msg)
1543 1546 elif self._debug_search:
1544 1547 msg = b"DBG-DELTAS-SEARCH: NO-DELTA\n"
1545 1548 self._write_debug(msg)
1546 1549 if nominateddeltas:
1547 1550 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
1548 1551 if deltainfo is not None:
1549 1552 candidaterevs = groups.send(deltainfo.base)
1550 1553 else:
1551 1554 candidaterevs = next(groups)
1552 1555
1553 1556 if deltainfo is None:
1554 1557 dbg_type = b"full"
1555 1558 deltainfo = self._fullsnapshotinfo(revinfo, target_rev)
1556 1559 elif deltainfo.snapshotdepth: # pytype: disable=attribute-error
1557 1560 dbg_type = b"snapshot"
1558 1561 else:
1559 1562 dbg_type = b"delta"
1560 1563
1561 1564 if gather_debug:
1562 1565 end = util.timer()
1563 1566 if dbg_type == b'full':
1564 1567 used_cached = (
1565 1568 cachedelta is not None
1566 1569 and dbg_try_rounds == 0
1567 1570 and dbg_try_count == 0
1568 1571 and cachedelta[0] == nullrev
1569 1572 )
1570 1573 else:
1571 1574 used_cached = (
1572 1575 cachedelta is not None
1573 1576 and dbg_try_rounds == 1
1574 1577 and dbg_try_count == 1
1575 1578 and deltainfo.base == cachedelta[0]
1576 1579 )
1577 1580 dbg['duration'] = end - start
1578 1581 dbg[
1579 1582 'delta-base'
1580 1583 ] = deltainfo.base # pytype: disable=attribute-error
1581 1584 dbg['search_round_count'] = dbg_try_rounds
1582 1585 dbg['using-cached-base'] = used_cached
1583 1586 dbg['delta_try_count'] = dbg_try_count
1584 1587 dbg['type'] = dbg_type
1585 1588 if (
1586 1589 deltainfo.snapshotdepth # pytype: disable=attribute-error
1587 1590 is not None
1588 1591 ):
1589 1592 dbg[
1590 1593 'snapshot-depth'
1591 1594 ] = deltainfo.snapshotdepth # pytype: disable=attribute-error
1592 1595 else:
1593 1596 dbg['snapshot-depth'] = 0
1594 1597 self._dbg_process_data(dbg)
1595 1598 return deltainfo
1596 1599
1597 1600 def _one_dbg_data(self):
1598 1601 return {
1599 1602 'duration': None,
1600 1603 'revision': None,
1601 1604 'delta-base': None,
1602 1605 'search_round_count': None,
1603 1606 'using-cached-base': None,
1604 1607 'delta_try_count': None,
1605 1608 'type': None,
1606 1609 'p1-chain-len': None,
1607 1610 'p2-chain-len': None,
1608 1611 'snapshot-depth': None,
1609 1612 'target-revlog': None,
1610 1613 }
1611 1614
1612 1615 def _dbg_process_data(self, dbg):
1613 1616 if self._debug_info is not None:
1614 1617 self._debug_info.append(dbg)
1615 1618
1616 1619 if self._write_debug is not None:
1617 1620 msg = (
1618 1621 b"DBG-DELTAS:"
1619 1622 b" %-12s"
1620 1623 b" rev=%d:"
1621 1624 b" delta-base=%d"
1622 1625 b" is-cached=%d"
1623 1626 b" - search-rounds=%d"
1624 1627 b" try-count=%d"
1625 1628 b" - delta-type=%-6s"
1626 1629 b" snap-depth=%d"
1627 1630 b" - p1-chain-length=%d"
1628 1631 b" p2-chain-length=%d"
1629 1632 b" - duration=%f"
1630 1633 b"\n"
1631 1634 )
1632 1635 msg %= (
1633 1636 dbg["target-revlog"],
1634 1637 dbg["revision"],
1635 1638 dbg["delta-base"],
1636 1639 dbg["using-cached-base"],
1637 1640 dbg["search_round_count"],
1638 1641 dbg["delta_try_count"],
1639 1642 dbg["type"],
1640 1643 dbg["snapshot-depth"],
1641 1644 dbg["p1-chain-len"],
1642 1645 dbg["p2-chain-len"],
1643 1646 dbg["duration"],
1644 1647 )
1645 1648 self._write_debug(msg)
1646 1649
1647 1650
1648 1651 def delta_compression(default_compression_header, deltainfo):
1649 1652 """return (COMPRESSION_MODE, deltainfo)
1650 1653
1651 1654 used by revlog v2+ format to dispatch between PLAIN and DEFAULT
1652 1655 compression.
1653 1656 """
1654 1657 h, d = deltainfo.data
1655 1658 compression_mode = COMP_MODE_INLINE
1656 1659 if not h and not d:
1657 1660 # no data to store at all... declare it uncompressed
1658 1661 compression_mode = COMP_MODE_PLAIN
1659 1662 elif not h:
1660 1663 t = d[0:1]
1661 1664 if t == b'\0':
1662 1665 compression_mode = COMP_MODE_PLAIN
1663 1666 elif t == default_compression_header:
1664 1667 compression_mode = COMP_MODE_DEFAULT
1665 1668 elif h == b'u':
1666 1669 # we have a more efficient way to declare uncompressed
1667 1670 h = b''
1668 1671 compression_mode = COMP_MODE_PLAIN
1669 1672 deltainfo = drop_u_compression(deltainfo)
1670 1673 return compression_mode, deltainfo
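A usage sketch, assuming a zlib-backed revlog whose default compression header is b'x' (zlib streams start with 0x78):

    # hypothetical: `deltainfo` comes from deltacomputer.finddeltainfo(...)
    mode, deltainfo = delta_compression(b'x', deltainfo)
    # mode is COMP_MODE_PLAIN for empty or uncompressed data,
    # COMP_MODE_DEFAULT when the chunk starts with the default b'x' header,
    # and COMP_MODE_INLINE otherwise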