##// END OF EJS Templates
deltas: skip if projected delta size is bigger than previous snapshot...
Valentin Gatien-Baron -
r42664:66c27df1 default
parent child Browse files
Show More
@@ -1,1035 +1,1041 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 util,
34 34 )
35 35
# maximum <delta-chain-data>/<revision-text-length> ratio: a delta chain is
# only considered usable if the total amount of delta data that must be
# applied stays below LIMIT_DELTA2TEXT times the full text length.
LIMIT_DELTA2TEXT = 2
38 38
class _testrevlog(object):
    """Minimal revlog stand-in used by the doctests in this module.

    It only models what the slicing helpers need: cumulative revision
    byte offsets, the sparse-read tuning knobs, and the set of revisions
    considered snapshots.
    """

    def __init__(self, data, density=0.5, mingap=0, snapshot=()):
        """data is a list of cumulative revision payload boundaries"""
        self._data = data
        self._srdensitythreshold = density
        self._srmingapsize = mingap
        self._snapshot = set(snapshot)
        self.index = None

    def start(self, rev):
        # both the null revision and revision 0 start at offset zero
        if rev in (nullrev, 0):
            return 0
        return self._data[rev - 1]

    def end(self, rev):
        return 0 if rev == nullrev else self._data[rev]

    def length(self, rev):
        return self.end(rev) - self.start(rev)

    def __len__(self):
        return len(self._data)

    def issnapshot(self, rev):
        # the null revision always counts as an (empty) snapshot
        return rev == nullrev or rev in self._snapshot
72 72
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
    For consistency with other slicing choices, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revision chunks are larger than this limit, they will still
    be yielded individually.

    >>> data = [
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ]
    >>> revlog = _testrevlog(data, snapshot=range(16))

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]

    Slicing involving nullrev
    >>> list(slicechunk(revlog, [-1, 0, 11, 13, 15], targetsize=20))
    [[-1, 0], [11], [13, 15]]
    >>> list(slicechunk(revlog, [-1, 13, 15], targetsize=5))
    [[-1], [13], [15]]
    """
    if targetsize is not None:
        # never slice below the minimal gap size enforced elsewhere
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    #
    # Prefer the density slicing provided by the index when available,
    # otherwise fall back to the pure-python implementation.
    densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
    if densityslicing is None:
        def densityslicing(r, t, m):
            return _slicechunktodensity(revlog, r, t, m)
    densitychunks = densityslicing(revs,
                                   revlog._srdensitythreshold,
                                   revlog._srmingapsize)
    for chunk in densitychunks:
        # enforce the byte-size cap on each density-selected chunk
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
145 145
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunks that density slicing selected, but
    that are still too large compared to the read guarantee of revlogs. This
    might happen when the "minimal gap size" interrupted the slicing or when
    chains are built in a way that creates large blocks next to each other.

    >>> data = [
    ...  3, #0 (3)
    ...  5, #1 (2)
    ...  6, #2 (1)
    ...  8, #3 (2)
    ...  8, #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ]

    == All snapshots cases ==
    >>> revlog = _testrevlog(data, snapshot=range(9))

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == No Snapshot cases ==
    >>> revlog = _testrevlog(data)

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1], [2, 3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [4, 5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == mixed case ==
    >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
    >>> list(_slicechunktosize(revlog, list(range(9)), 5))
    [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
    """
    assert targetsize is None or 0 <= targetsize
    startdata = revlog.start(revs[0])
    enddata = revlog.end(revs[-1])
    fullspan = enddata - startdata
    # fast path: no limit, or the whole span already fits in it
    if targetsize is None or fullspan <= targetsize:
        yield revs
        return

    startrevidx = 0
    endrevidx = 1
    iterrevs = enumerate(revs)
    next(iterrevs) # skip first rev.
    # first step: get snapshots out of the way
    #
    # Leading snapshot revisions are grouped greedily as long as the group's
    # span stays within `targetsize`; the first non-snapshot revision ends
    # this pass and hands over to the binary slicing below.
    for idx, r in iterrevs:
        span = revlog.end(r) - startdata
        snapshot = revlog.issnapshot(r)
        if span <= targetsize and snapshot:
            # still fits: extend the current group
            endrevidx = idx + 1
        else:
            # current group is full (or we hit a non-snapshot): emit it and
            # start a new one at `r`
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx + 1
        if not snapshot:
            break

    # for the others, we use binary slicing to quickly converge toward valid
    # chunks (otherwise, we might end up looking for start/end of many
    # revisions). This logic is not looking for the perfect slicing point, it
    # focuses on quickly converging toward valid chunks.
    nbitem = len(revs)
    while (enddata - startdata) > targetsize:
        endrevidx = nbitem
        if nbitem - startrevidx <= 1:
            break # protect against individual chunk larger than limit
        localenddata = revlog.end(revs[endrevidx - 1])
        span = localenddata - startdata
        # halve the candidate range until its span fits the target
        while span > targetsize:
            if endrevidx - startrevidx <= 1:
                break # protect against individual chunk larger than limit
            endrevidx -= (endrevidx - startrevidx) // 2
            localenddata = revlog.end(revs[endrevidx - 1])
            span = localenddata - startdata
        chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
        if chunk:
            yield chunk
        startrevidx = endrevidx
        startdata = revlog.start(revs[startrevidx])

    # emit whatever remains after the last cut
    chunk = _trimchunk(revlog, revs, startrevidx)
    if chunk:
        yield chunk
299 299
def _slicechunktodensity(revlog, revs, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5, #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision is always dense enough
    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    # a span smaller than the minimal gap size is not worth splitting
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    if density >= targetdensity:
        yield revs
        return

    # Collect all gaps between consecutive revisions, as (size, index) pairs,
    # so they can later be processed from largest to smallest.
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps ascending, so popping from the end yields the largest
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # skipping this gap removes its bytes from the amount of data we
        # would have to read, improving the density
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
425 425
426 426 def _trimchunk(revlog, revs, startidx, endidx=None):
427 427 """returns revs[startidx:endidx] without empty trailing revs
428 428
429 429 Doctest Setup
430 430 >>> revlog = _testrevlog([
431 431 ... 5, #0
432 432 ... 10, #1
433 433 ... 12, #2
434 434 ... 12, #3 (empty)
435 435 ... 17, #4
436 436 ... 21, #5
437 437 ... 21, #6 (empty)
438 438 ... ])
439 439
440 440 Contiguous cases:
441 441 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
442 442 [0, 1, 2, 3, 4, 5]
443 443 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
444 444 [0, 1, 2, 3, 4]
445 445 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
446 446 [0, 1, 2]
447 447 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
448 448 [2]
449 449 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
450 450 [3, 4, 5]
451 451 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
452 452 [3, 4]
453 453
454 454 Discontiguous cases:
455 455 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
456 456 [1, 3, 5]
457 457 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
458 458 [1]
459 459 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
460 460 [3, 5]
461 461 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
462 462 [3, 5]
463 463 """
464 464 length = revlog.length
465 465
466 466 if endidx is None:
467 467 endidx = len(revs)
468 468
469 469 # If we have a non-emtpy delta candidate, there are nothing to trim
470 470 if revs[endidx - 1] < len(revlog):
471 471 # Trim empty revs at the end, except the very first revision of a chain
472 472 while (endidx > 1
473 473 and endidx > startidx
474 474 and length(revs[endidx - 1]) == 0):
475 475 endidx -= 1
476 476
477 477 return revs[startidx:endidx]
478 478
def segmentspan(revlog, revs):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5, #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ...  ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    # an empty segment spans no bytes at all
    if not revs:
        return 0
    # the segment covers everything from the start of its first revision to
    # the end of its last one, gaps included
    return revlog.end(revs[-1]) - revlog.start(revs[0])
507 507
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """Build the full text of a revision from a (base, delta) pair.

    fh:           file handle to read the base revision from
    baserev:      revision number the delta applies to
    delta:        the binary delta
    p1, p2:       parent nodes, used for hash validation
    flags:        revision index flags
    expectednode: node the resulting text must hash to

    Raises error.StorageError if REVIDX_ISCENSORED is set but the resulting
    text is not actually censored; re-raises CensoredNodeError when the flag
    is missing on a censored revision.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            # the censored flag was set but hash validation succeeded:
            # the revision is not actually censored
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
536 536
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    """Immutable description of a candidate delta.

    See `isgooddeltainfo` for how these fields are bounded.
    """
    # distance from the base revision -- bounding it limits the I/O needed
    distance = attr.ib()
    # length of the delta itself
    deltalen = attr.ib()
    # the delta payload -- NOTE(review): opaque at this level; produced and
    # consumed by the delta computation code
    data = attr.ib()
    # revision number of the delta base
    base = attr.ib()
    # presumably the revision the whole delta chain starts from -- confirm
    chainbase = attr.ib()
    # number of deltas in the chain
    chainlen = attr.ib()
    # total size of all deltas in the chain -- bounding it limits CPU usage
    compresseddeltalen = attr.ib()
    # snapshot depth if this revision is an intermediate snapshot,
    # None otherwise
    snapshotdepth = attr.ib()
547 547
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Return True if the candidate delta stays within our performance bounds.

    "Good" means the delta is acceptable with regard to the disk span, disk
    size, and chain length limits that we know to be performant.
    """
    if deltainfo is None:
        return False

    # - 'deltainfo.distance' is the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    textlen = revinfo.textlen
    maxdist = revlog._maxdeltachainspan
    if not maxdist:
        # no configured limit: make the span check below always pass
        maxdist = deltainfo.distance
    maxdist = max(maxdist, textlen * 4)

    # Bad delta from read span:
    #
    # The span of data read is larger than the maximum allowed. In the
    # sparse-revlog case we rely on the associated "sparse reading" instead,
    # so the span check is skipped entirely; pathological delta patterns are
    # possible in theory but do not happen in practice.
    if not revlog._sparserevlog and maxdist < deltainfo.distance:
        return False

    # Bad delta from new delta size:
    #
    # A delta larger than the target text it encodes is a storage loss.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    # The sum of the deltas must stay below K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    # Too many deltas in the chain.
    if (revlog._maxchainlen
            and revlog._maxchainlen < deltainfo.chainlen):
        return False

    # Bad delta from intermediate snapshot size limit:
    #
    # The depth-derived limit exists to prevent endless chains of
    # intermediate deltas from being created.
    if (deltainfo.snapshotdepth is not None
            and (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
        return False

    # Bad delta: a new intermediate snapshot larger than the previous
    # snapshot it is based on.
    if (deltainfo.snapshotdepth
            and revlog.length(deltainfo.base) < deltainfo.deltalen):
        return False

    return True
616 616
# If a revision's full text is that much bigger than a base candidate's full
# text, it is very unlikely that it will produce a valid delta. We no longer
# consider these candidates.
LIMIT_BASE2TEXT = 500
621 621
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides groups of revisions to be tested as delta bases.

    This top level function focuses on emitting groups with unique and
    worthwhile content. See `_rawgroups` for details about the group order.

    This is a generator driven through `.send()`: after each yielded group,
    the caller sends back the revision it selected as a good base (or None),
    which steers the refinement logic.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    sparse = revlog._sparserevlog
    good = None

    # upper bound of the amount of delta data acceptable for this text
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = {nullrev}
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while (revlog._generaldelta
                   and not (rev == nullrev
                            or rev in tested
                            or deltalength(rev))):
                tested.add(rev)
                rev = deltaparent(rev)
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # filter out revisions we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta bases that will never produce a good delta
            if deltas_limit < revlog.length(rev):
                continue
            # a base much smaller than the text is unlikely to delta well
            if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            # If we reach here, we are about to build and test a delta.
            # The delta building process will compute the chaininfo in all
            # cases; since that computation is cached, it is fine to access
            # it here too.
            chainlen, chainsize = revlog._chaininfo(rev)
            # if chain will be too long, skip base
            if revlog._maxchainlen and chainlen >= revlog._maxchainlen:
                continue
            # if chain already has too much data, skip base
            if deltas_limit < chainsize:
                continue
            if sparse and revlog.upperboundcomp is not None:
                maxcomp = revlog.upperboundcomp
                basenotsnap = (p1, p2, nullrev)
                if rev not in basenotsnap and revlog.issnapshot(rev):
                    snapshotdepth = revlog.snapshotdepth(rev)
                    # If text is significantly larger than the base, we can
                    # expect the resulting delta to be proportional to the
                    # size difference
                    revsize = revlog.rawsize(rev)
                    rawsizedistance = max(textlen - revsize, 0)
                    # use an estimate of the compression upper bound.
                    lowestrealisticdeltalen = rawsizedistance // maxcomp

                    # check the absolute constraint on the delta size
                    snapshotlimit = textlen >> snapshotdepth
                    if snapshotlimit < lowestrealisticdeltalen:
                        # delta lower bound is larger than accepted upper
                        # bound
                        continue

                    # check the relative constraint on the delta size
                    revlength = revlog.length(rev)
                    if revlength < lowestrealisticdeltalen:
                        # delta probable lower bound is larger than target
                        # base
                        continue

            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performance. Some bounding or slicing mechanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
708 714
def _findsnapshots(revlog, cache, start_rev):
    """Collect all snapshot revisions from start_rev to tip into `cache`.

    `cache` maps a delta-parent revision to the list of snapshot revisions
    based on it.
    """
    # prefer the index's native implementation when it provides one
    if util.safehasattr(revlog.index, 'findsnapshots'):
        revlog.index.findsnapshots(cache, start_rev)
        return
    deltaparent = revlog.deltaparent
    issnapshot = revlog.issnapshot
    for rev in revlog.revs(start_rev):
        if not issnapshot(rev):
            continue
        cache[deltaparent(rev)].append(rev)
719 725
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Yield candidate base groups, refining around a good base once found.

    Generator protocol: the consumer sends back the revision it accepted as
    a good base (or None) after each yielded group; a final None terminates
    the stream.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be
    # disabled through configuration. Disabling reuse of the source delta is
    # useful when we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    snapshots = collections.defaultdict(list)
    for candidates in _rawgroups(revlog, p1, p2, cachedelta, snapshots):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk toward the delta parent as long as the
        # consumer keeps accepting the proposed base
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: propose snapshot children of the current base
        if not snapshots:
            _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
767 773
def _rawgroups(revlog, p1, p2, cachedelta, snapshots=None):
    """Provides groups of revisions to be tested as delta bases.

    This lower level function focuses on emitting deltas that are
    theoretically interesting, without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    # gate sparse behind general-delta because of issue6056
    sparse = gdelta and revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    # NOTE: `parents` is only bound when gdelta is true, but `sparse`
    # implies gdelta, so the short-circuit below is safe.
    if sparse and parents:
        if snapshots is None:
            # map: base-rev: snapshot-rev
            snapshots = collections.defaultdict(list)
        # See if we can use an existing snapshot in the parent chains to use
        # as a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revisions increases the odds
        # we reuse an independent, yet better, snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily
        # enumerate over the chains. That would be more efficient, however
        # we stick to simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated
            # intermediate snapshots that are deeper than the ones from the
            # parent delta chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot bases.
        # We test them from highest to lowest level. High level ones are
        # more likely to result in small deltas.
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we
            # check if an existing snapshot, child of a base we consider,
            # would be suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains,
            # creating more and more snapshots as the repository grows.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before have fewer
                # chances to be valid bases since our ancestors had to
                # create a new snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain; search if any full
        # snapshots emitted since the parent's base would be a suitable base
        # for an intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains,
        # creating more and more snapshots as the repository grows.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approaches failed: try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
887 893
class deltacomputer(object):
    """Pick and build deltas for revisions being appended to a revlog.

    The computer keeps a reference to the target revlog and relies on its
    index, chain information and compression helpers to select an
    acceptable delta base — or to fall back to storing a full snapshot.
    """

    def __init__(self, revlog):
        # revlog: the revlog the new revision(s) will be stored into
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh: file handle to either the .i or the .d revlog file,
            depending on whether it is inlined or not

        The result is memoized in revinfo.btext[0], so later calls for the
        same revision pay the reconstruction cost only once.
        """
        btext = revinfo.btext
        if btext[0] is not None:
            # fulltext already known (cached or supplied by the caller)
            return btext[0]

        revlog = self.revlog
        # No cached fulltext: rebuild it by applying the cached delta on
        # top of its base revision (assumes revinfo.cachedelta is set when
        # btext[0] is None — callers are expected to guarantee this).
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return an uncompressed delta turning ``base`` into this
        revision's fulltext."""
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            # plain binary diff against the (raw) text of the base revision
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a _deltainfo describing a delta of this revision against
        ``base``, reusing the incoming cached delta when it still applies.
        """
        # can we use the cached delta?
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            #check if the diff still apply
            # Walk past zero-length (empty) revisions: a delta against an
            # empty revision is equivalent to a delta against that
            # revision's own delta parent, so follow the chain until we
            # either reach ``base`` or a non-empty revision.
            currentbase = cachebase
            while (currentbase != nullrev
                   and currentbase != base
                   and self.revlog.length(currentbase) == 0):
                currentbase = self.revlog.deltaparent(currentbase)
            # Only reuse the incoming delta when the revlog opted into lazy
            # delta reuse and the (possibly rebased) cache base matches.
            if self.revlog._lazydelta and currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        # ``dist`` is the span of on-disk data that must be read to restore
        # this revision: from the start of its chain base to the end of the
        # new delta appended at the current end of the revlog.
        offset = revlog.end(len(revlog) - 1)
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            # generaldelta: the delta base can be any revision
            deltabase = base
        else:
            # legacy format: deltas are always against the chain base
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        # snapshotdepth stays None for plain deltas; it is only set when
        # this revision would itself be stored as a (possibly intermediate)
        # snapshot.
        snapshotdepth = None
        if deltabase == nullrev:
            # delta against null is a full snapshot (depth 0)
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                # base is a snapshot that is not a parent: the new revision
                # becomes an intermediate snapshot one level deeper than
                # its base's chain
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        """Return a _deltainfo for storing this revision as a full snapshot
        (compressed fulltext, no delta base)."""
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        # a full snapshot is its own chain: length 1, base is itself
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh: file handle to either the .i or the .d revlog file,
            depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        if not revinfo.textlen:
            # empty fulltext: nothing to delta against, store a snapshot
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        # _candidategroups is a generator of candidate-base groups; after a
        # good delta is found, its base is sent back so the generator can
        # refine the next group of candidates.
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # keep the smallest acceptable delta seen so far
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            # every candidate was rejected: fall back to a full snapshot
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now