##// END OF EJS Templates
sparse-revlog: rework the way we enforce chunk size limit...
Boris Feld -
r40678:9c3c6972 default
parent child Browse files
Show More
@@ -1,899 +1,977 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 )
34 34
35 35 # maximum <delta-chain-data>/<revision-text-length> ratio
36 36 LIMIT_DELTA2TEXT = 2
37 37
38 38 class _testrevlog(object):
39 39 """minimalist fake revlog to use in doctests"""
40 40
41 41 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
42 42 """data is an list of revision payload boundaries"""
43 43 self._data = data
44 44 self._srdensitythreshold = density
45 45 self._srmingapsize = mingap
46 46 self._snapshot = set(snapshot)
47 47
48 48 def start(self, rev):
49 49 if rev == 0:
50 50 return 0
51 51 return self._data[rev - 1]
52 52
53 53 def end(self, rev):
54 54 return self._data[rev]
55 55
56 56 def length(self, rev):
57 57 return self.end(rev) - self.start(rev)
58 58
59 59 def __len__(self):
60 60 return len(self._data)
61 61
62 62 def issnapshot(self, rev):
63 63 return rev in self._snapshot
64 64
65 65 def slicechunk(revlog, revs, targetsize=None):
66 66 """slice revs to reduce the amount of unrelated data to be read from disk.
67 67
68 68 ``revs`` is sliced into groups that should be read in one time.
69 69 Assume that revs are sorted.
70 70
71 71 The initial chunk is sliced until the overall density (payload/chunks-span
72 72 ratio) is above `revlog._srdensitythreshold`. No gap smaller than
73 73 `revlog._srmingapsize` is skipped.
74 74
75 75 If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
76 76 For consistency with other slicing choice, this limit won't go lower than
77 77 `revlog._srmingapsize`.
78 78
79 79 If individual revision chunks are larger than this limit, they will still
80 80 be raised individually.
81 81
82 >>> revlog = _testrevlog([
82 >>> data = [
83 83 ... 5, #00 (5)
84 84 ... 10, #01 (5)
85 85 ... 12, #02 (2)
86 86 ... 12, #03 (empty)
87 87 ... 27, #04 (15)
88 88 ... 31, #05 (4)
89 89 ... 31, #06 (empty)
90 90 ... 42, #07 (11)
91 91 ... 47, #08 (5)
92 92 ... 47, #09 (empty)
93 93 ... 48, #10 (1)
94 94 ... 51, #11 (3)
95 95 ... 74, #12 (23)
96 96 ... 85, #13 (11)
97 97 ... 86, #14 (1)
98 98 ... 91, #15 (5)
99 ... ])
99 ... ]
100 >>> revlog = _testrevlog(data, snapshot=range(16))
100 101
101 102 >>> list(slicechunk(revlog, list(range(16))))
102 103 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
103 104 >>> list(slicechunk(revlog, [0, 15]))
104 105 [[0], [15]]
105 106 >>> list(slicechunk(revlog, [0, 11, 15]))
106 107 [[0], [11], [15]]
107 108 >>> list(slicechunk(revlog, [0, 11, 13, 15]))
108 109 [[0], [11, 13, 15]]
109 110 >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
110 111 [[1, 2], [5, 8, 10, 11], [14]]
111 112
112 113 Slicing with a maximum chunk size
113 114 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
114 115 [[0], [11], [13], [15]]
115 116 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
116 117 [[0], [11], [13, 15]]
117 118 """
118 119 if targetsize is not None:
119 120 targetsize = max(targetsize, revlog._srmingapsize)
120 121 # targetsize should not be specified when evaluating delta candidates:
121 122 # * targetsize is used to ensure we stay within specification when reading,
122 123 for chunk in _slicechunktodensity(revlog, revs,
123 124 revlog._srdensitythreshold,
124 125 revlog._srmingapsize):
125 126 for subchunk in _slicechunktosize(revlog, chunk, targetsize):
126 127 yield subchunk
127 128
128 129 def _slicechunktosize(revlog, revs, targetsize=None):
129 130 """slice revs to match the target size
130 131
131 132 This is intended to be used on chunks that density slicing selected but that
132 133 are still too large compared to the read guarantee of revlog. This might
133 134 happen when "minimal gap size" interrupted the slicing or when chains are
134 135 built in a way that creates large blocks next to each other.
135 136
136 >>> revlog = _testrevlog([
137 >>> data = [
137 138 ... 3, #0 (3)
138 139 ... 5, #1 (2)
139 140 ... 6, #2 (1)
140 141 ... 8, #3 (2)
141 142 ... 8, #4 (empty)
142 143 ... 11, #5 (3)
143 144 ... 12, #6 (1)
144 145 ... 13, #7 (1)
145 146 ... 14, #8 (1)
146 ... ])
147 ... ]
148
149 == All snapshots cases ==
150 >>> revlog = _testrevlog(data, snapshot=range(9))
147 151
148 152 Cases where chunk is already small enough
149 153 >>> list(_slicechunktosize(revlog, [0], 3))
150 154 [[0]]
151 155 >>> list(_slicechunktosize(revlog, [6, 7], 3))
152 156 [[6, 7]]
153 157 >>> list(_slicechunktosize(revlog, [0], None))
154 158 [[0]]
155 159 >>> list(_slicechunktosize(revlog, [6, 7], None))
156 160 [[6, 7]]
157 161
158 162 cases where we need actual slicing
159 163 >>> list(_slicechunktosize(revlog, [0, 1], 3))
160 164 [[0], [1]]
161 165 >>> list(_slicechunktosize(revlog, [1, 3], 3))
162 166 [[1], [3]]
163 167 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
164 168 [[1, 2], [3]]
165 169 >>> list(_slicechunktosize(revlog, [3, 5], 3))
166 170 [[3], [5]]
167 171 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
168 172 [[3], [5]]
169 173 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
170 174 [[5], [6, 7, 8]]
171 175 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
172 176 [[0], [1, 2], [3], [5], [6, 7, 8]]
173 177
174 178 Case with too large individual chunk (must return valid chunk)
175 179 >>> list(_slicechunktosize(revlog, [0, 1], 2))
176 180 [[0], [1]]
177 181 >>> list(_slicechunktosize(revlog, [1, 3], 1))
178 182 [[1], [3]]
179 183 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
180 184 [[3], [5]]
185
186 == No Snapshot cases ==
187 >>> revlog = _testrevlog(data)
188
189 Cases where chunk is already small enough
190 >>> list(_slicechunktosize(revlog, [0], 3))
191 [[0]]
192 >>> list(_slicechunktosize(revlog, [6, 7], 3))
193 [[6, 7]]
194 >>> list(_slicechunktosize(revlog, [0], None))
195 [[0]]
196 >>> list(_slicechunktosize(revlog, [6, 7], None))
197 [[6, 7]]
198
199 cases where we need actual slicing
200 >>> list(_slicechunktosize(revlog, [0, 1], 3))
201 [[0], [1]]
202 >>> list(_slicechunktosize(revlog, [1, 3], 3))
203 [[1], [3]]
204 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
205 [[1], [2, 3]]
206 >>> list(_slicechunktosize(revlog, [3, 5], 3))
207 [[3], [5]]
208 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
209 [[3], [4, 5]]
210 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
211 [[5], [6, 7, 8]]
212 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
213 [[0], [1, 2], [3], [5], [6, 7, 8]]
214
215 Case with too large individual chunk (must return valid chunk)
216 >>> list(_slicechunktosize(revlog, [0, 1], 2))
217 [[0], [1]]
218 >>> list(_slicechunktosize(revlog, [1, 3], 1))
219 [[1], [3]]
220 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
221 [[3], [5]]
222
223 == mixed case ==
224 >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
225 >>> list(_slicechunktosize(revlog, list(range(9)), 5))
226 [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
181 227 """
182 228 assert targetsize is None or 0 <= targetsize
183 if targetsize is None or segmentspan(revlog, revs) <= targetsize:
229 startdata = revlog.start(revs[0])
230 enddata = revlog.end(revs[-1])
231 fullspan = enddata - startdata
232 if targetsize is None or fullspan <= targetsize:
184 233 yield revs
185 234 return
186 235
187 236 startrevidx = 0
188 startdata = revlog.start(revs[0])
189 237 endrevidx = 0
190 238 iterrevs = enumerate(revs)
191 239 next(iterrevs) # skip first rev.
240 # first step: get snapshots out of the way
192 241 for idx, r in iterrevs:
193 242 span = revlog.end(r) - startdata
194 if span <= targetsize:
243 snapshot = revlog.issnapshot(r)
244 if span <= targetsize and snapshot:
195 245 endrevidx = idx
196 246 else:
197 247 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1)
198 248 if chunk:
199 249 yield chunk
200 250 startrevidx = idx
201 251 startdata = revlog.start(r)
202 252 endrevidx = idx
203 yield _trimchunk(revlog, revs, startrevidx)
253 if not snapshot:
254 break
255
256 # for the others, we use binary slicing to quickly converge toward valid
257 # chunks (otherwise, we might end up looking for start/end of many
258 # revisions). This logic is not looking for the perfect slicing point, it
259 # focuses on quickly converging toward valid chunks.
260 nbitem = len(revs)
261 while (enddata - startdata) > targetsize:
262 endrevidx = nbitem
263 if nbitem - startrevidx <= 1:
264 break # protect against individual chunk larger than limit
265 localenddata = revlog.end(revs[endrevidx - 1])
266 span = localenddata - startdata
267 while (localenddata - startdata) > targetsize:
268 if endrevidx - startrevidx <= 1:
269 break # protect against individual chunk larger than limit
270 endrevidx -= (endrevidx - startrevidx) // 2
271 localenddata = revlog.end(revs[endrevidx - 1])
272 span = localenddata - startdata
273 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
274 if chunk:
275 yield chunk
276 startrevidx = endrevidx
277 startdata = revlog.start(revs[startrevidx])
278
279 chunk = _trimchunk(revlog, revs, startrevidx)
280 if chunk:
281 yield chunk
204 282
205 283 def _slicechunktodensity(revlog, revs, targetdensity=0.5,
206 284 mingapsize=0):
207 285 """slice revs to reduce the amount of unrelated data to be read from disk.
208 286
209 287 ``revs`` is sliced into groups that should be read in one time.
210 288 Assume that revs are sorted.
211 289
212 290 The initial chunk is sliced until the overall density (payload/chunks-span
213 291 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
214 292 skipped.
215 293
216 294 >>> revlog = _testrevlog([
217 295 ... 5, #00 (5)
218 296 ... 10, #01 (5)
219 297 ... 12, #02 (2)
220 298 ... 12, #03 (empty)
221 299 ... 27, #04 (15)
222 300 ... 31, #05 (4)
223 301 ... 31, #06 (empty)
224 302 ... 42, #07 (11)
225 303 ... 47, #08 (5)
226 304 ... 47, #09 (empty)
227 305 ... 48, #10 (1)
228 306 ... 51, #11 (3)
229 307 ... 74, #12 (23)
230 308 ... 85, #13 (11)
231 309 ... 86, #14 (1)
232 310 ... 91, #15 (5)
233 311 ... ])
234 312
235 313 >>> list(_slicechunktodensity(revlog, list(range(16))))
236 314 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
237 315 >>> list(_slicechunktodensity(revlog, [0, 15]))
238 316 [[0], [15]]
239 317 >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
240 318 [[0], [11], [15]]
241 319 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
242 320 [[0], [11, 13, 15]]
243 321 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
244 322 [[1, 2], [5, 8, 10, 11], [14]]
245 323 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
246 324 ... mingapsize=20))
247 325 [[1, 2, 3, 5, 8, 10, 11], [14]]
248 326 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
249 327 ... targetdensity=0.95))
250 328 [[1, 2], [5], [8, 10, 11], [14]]
251 329 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
252 330 ... targetdensity=0.95, mingapsize=12))
253 331 [[1, 2], [5, 8, 10, 11], [14]]
254 332 """
255 333 start = revlog.start
256 334 length = revlog.length
257 335
258 336 if len(revs) <= 1:
259 337 yield revs
260 338 return
261 339
262 340 deltachainspan = segmentspan(revlog, revs)
263 341
264 342 if deltachainspan < mingapsize:
265 343 yield revs
266 344 return
267 345
268 346 readdata = deltachainspan
269 347 chainpayload = sum(length(r) for r in revs)
270 348
271 349 if deltachainspan:
272 350 density = chainpayload / float(deltachainspan)
273 351 else:
274 352 density = 1.0
275 353
276 354 if density >= targetdensity:
277 355 yield revs
278 356 return
279 357
280 358 # Store the gaps in a heap to have them sorted by decreasing size
281 359 gaps = []
282 360 prevend = None
283 361 for i, rev in enumerate(revs):
284 362 revstart = start(rev)
285 363 revlen = length(rev)
286 364
287 365 # Skip empty revisions to form larger holes
288 366 if revlen == 0:
289 367 continue
290 368
291 369 if prevend is not None:
292 370 gapsize = revstart - prevend
293 371 # only consider holes that are large enough
294 372 if gapsize > mingapsize:
295 373 gaps.append((gapsize, i))
296 374
297 375 prevend = revstart + revlen
298 376 # sort the gaps to pop them from largest to small
299 377 gaps.sort()
300 378
301 379 # Collect the indices of the largest holes until the density is acceptable
302 380 selected = []
303 381 while gaps and density < targetdensity:
304 382 gapsize, gapidx = gaps.pop()
305 383
306 384 selected.append(gapidx)
307 385
308 386 # the gap sizes are stored as negatives to be sorted decreasingly
309 387 # by the heap
310 388 readdata -= gapsize
311 389 if readdata > 0:
312 390 density = chainpayload / float(readdata)
313 391 else:
314 392 density = 1.0
315 393 selected.sort()
316 394
317 395 # Cut the revs at collected indices
318 396 previdx = 0
319 397 for idx in selected:
320 398
321 399 chunk = _trimchunk(revlog, revs, previdx, idx)
322 400 if chunk:
323 401 yield chunk
324 402
325 403 previdx = idx
326 404
327 405 chunk = _trimchunk(revlog, revs, previdx)
328 406 if chunk:
329 407 yield chunk
330 408
331 409 def _trimchunk(revlog, revs, startidx, endidx=None):
332 410 """returns revs[startidx:endidx] without empty trailing revs
333 411
334 412 Doctest Setup
335 413 >>> revlog = _testrevlog([
336 414 ... 5, #0
337 415 ... 10, #1
338 416 ... 12, #2
339 417 ... 12, #3 (empty)
340 418 ... 17, #4
341 419 ... 21, #5
342 420 ... 21, #6 (empty)
343 421 ... ])
344 422
345 423 Contiguous cases:
346 424 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
347 425 [0, 1, 2, 3, 4, 5]
348 426 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
349 427 [0, 1, 2, 3, 4]
350 428 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
351 429 [0, 1, 2]
352 430 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
353 431 [2]
354 432 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
355 433 [3, 4, 5]
356 434 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
357 435 [3, 4]
358 436
359 437 Discontiguous cases:
360 438 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
361 439 [1, 3, 5]
362 440 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
363 441 [1]
364 442 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
365 443 [3, 5]
366 444 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
367 445 [3, 5]
368 446 """
369 447 length = revlog.length
370 448
371 449 if endidx is None:
372 450 endidx = len(revs)
373 451
374 452 # If we have a non-empty delta candidate, there is nothing to trim
375 453 if revs[endidx - 1] < len(revlog):
376 454 # Trim empty revs at the end, except the very first revision of a chain
377 455 while (endidx > 1
378 456 and endidx > startidx
379 457 and length(revs[endidx - 1]) == 0):
380 458 endidx -= 1
381 459
382 460 return revs[startidx:endidx]
383 461
384 462 def segmentspan(revlog, revs):
385 463 """Get the byte span of a segment of revisions
386 464
387 465 revs is a sorted array of revision numbers
388 466
389 467 >>> revlog = _testrevlog([
390 468 ... 5, #0
391 469 ... 10, #1
392 470 ... 12, #2
393 471 ... 12, #3 (empty)
394 472 ... 17, #4
395 473 ... ])
396 474
397 475 >>> segmentspan(revlog, [0, 1, 2, 3, 4])
398 476 17
399 477 >>> segmentspan(revlog, [0, 4])
400 478 17
401 479 >>> segmentspan(revlog, [3, 4])
402 480 5
403 481 >>> segmentspan(revlog, [1, 2, 3,])
404 482 7
405 483 >>> segmentspan(revlog, [1, 3])
406 484 7
407 485 """
408 486 if not revs:
409 487 return 0
410 488 end = revlog.end(revs[-1])
411 489 return end - revlog.start(revs[0])
412 490
413 491 def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
414 492 """build full text from a (base, delta) pair and other metadata"""
415 493 # special case deltas which replace entire base; no need to decode
416 494 # base revision. this neatly avoids censored bases, which throw when
417 495 # they're decoded.
418 496 hlen = struct.calcsize(">lll")
419 497 if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
420 498 len(delta) - hlen):
421 499 fulltext = delta[hlen:]
422 500 else:
423 501 # deltabase is rawtext before changed by flag processors, which is
424 502 # equivalent to non-raw text
425 503 basetext = revlog.revision(baserev, _df=fh, raw=False)
426 504 fulltext = mdiff.patch(basetext, delta)
427 505
428 506 try:
429 507 res = revlog._processflags(fulltext, flags, 'read', raw=True)
430 508 fulltext, validatehash = res
431 509 if validatehash:
432 510 revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
433 511 if flags & REVIDX_ISCENSORED:
434 512 raise error.StorageError(_('node %s is not censored') %
435 513 expectednode)
436 514 except error.CensoredNodeError:
437 515 # must pass the censored index flag to add censored revisions
438 516 if not flags & REVIDX_ISCENSORED:
439 517 raise
440 518 return fulltext
441 519
442 520 @attr.s(slots=True, frozen=True)
443 521 class _deltainfo(object):
444 522 distance = attr.ib()
445 523 deltalen = attr.ib()
446 524 data = attr.ib()
447 525 base = attr.ib()
448 526 chainbase = attr.ib()
449 527 chainlen = attr.ib()
450 528 compresseddeltalen = attr.ib()
451 529 snapshotdepth = attr.ib()
452 530
453 531 def isgooddeltainfo(revlog, deltainfo, revinfo):
454 532 """Returns True if the given delta is good. Good means that it is within
455 533 the disk span, disk size, and chain length bounds that we know to be
456 534 performant."""
457 535 if deltainfo is None:
458 536 return False
459 537
460 538 # - 'deltainfo.distance' is the distance from the base revision --
461 539 # bounding it limits the amount of I/O we need to do.
462 540 # - 'deltainfo.compresseddeltalen' is the sum of the total size of
463 541 # deltas we need to apply -- bounding it limits the amount of CPU
464 542 # we consume.
465 543
466 544 textlen = revinfo.textlen
467 545 defaultmax = textlen * 4
468 546 maxdist = revlog._maxdeltachainspan
469 547 if not maxdist:
470 548 maxdist = deltainfo.distance # ensure the conditional pass
471 549 maxdist = max(maxdist, defaultmax)
472 550
473 551 # Bad delta from read span:
474 552 #
475 553 # If the span of data read is larger than the maximum allowed.
476 554 #
477 555 # In the sparse-revlog case, we rely on the associated "sparse reading"
478 556 # to avoid issue related to the span of data. In theory, it would be
479 557 # possible to build pathological revlog where delta pattern would lead
480 558 # to too many reads. However, they do not happen in practice at all. So
481 559 # we skip the span check entirely.
482 560 if not revlog._sparserevlog and maxdist < deltainfo.distance:
483 561 return False
484 562
485 563 # Bad delta from new delta size:
486 564 #
487 565 # If the delta size is larger than the target text, storing the
488 566 # delta will be inefficient.
489 567 if textlen < deltainfo.deltalen:
490 568 return False
491 569
492 570 # Bad delta from cumulated payload size:
493 571 #
494 572 # If the sum of delta get larger than K * target text length.
495 573 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
496 574 return False
497 575
498 576 # Bad delta from chain length:
499 577 #
500 578 # If the number of delta in the chain gets too high.
501 579 if (revlog._maxchainlen
502 580 and revlog._maxchainlen < deltainfo.chainlen):
503 581 return False
504 582
505 583 # bad delta from intermediate snapshot size limit
506 584 #
507 585 # If an intermediate snapshot size is higher than the limit. The
508 586 # limit exist to prevent endless chain of intermediate delta to be
509 587 # created.
510 588 if (deltainfo.snapshotdepth is not None and
511 589 (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
512 590 return False
513 591
514 592 # bad delta if new intermediate snapshot is larger than the previous
515 593 # snapshot
516 594 if (deltainfo.snapshotdepth
517 595 and revlog.length(deltainfo.base) < deltainfo.deltalen):
518 596 return False
519 597
520 598 return True
521 599
522 600 def _candidategroups(revlog, textlen, p1, p2, cachedelta):
523 601 """Provides group of revision to be tested as delta base
524 602
525 603 This top level function focuses on emitting groups with unique and worthwhile
526 604 content. See _raw_candidate_groups for details about the group order.
527 605 """
528 606 # should we try to build a delta?
529 607 if not (len(revlog) and revlog._storedeltachains):
530 608 yield None
531 609 return
532 610
533 611 deltalength = revlog.length
534 612 deltaparent = revlog.deltaparent
535 613 good = None
536 614
537 615 deltas_limit = textlen * LIMIT_DELTA2TEXT
538 616
539 617 tested = set([nullrev])
540 618 candidates = _refinedgroups(revlog, p1, p2, cachedelta)
541 619 while True:
542 620 temptative = candidates.send(good)
543 621 if temptative is None:
544 622 break
545 623 group = []
546 624 for rev in temptative:
547 625 # skip over empty delta (no need to include them in a chain)
548 626 while (revlog._generaldelta
549 627 and not (rev == nullrev
550 628 or rev in tested
551 629 or deltalength(rev))):
552 630 tested.add(rev)
553 631 rev = deltaparent(rev)
554 632 # filter out revision we tested already
555 633 if rev in tested:
556 634 continue
557 635 tested.add(rev)
558 636 # filter out delta base that will never produce good delta
559 637 if deltas_limit < revlog.length(rev):
560 638 continue
561 639 # no need to try a delta against nullrev, this will be done as a
562 640 # last resort.
563 641 if rev == nullrev:
564 642 continue
565 643 # no delta for rawtext-changing revs (see "candelta" for why)
566 644 if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
567 645 continue
568 646 group.append(rev)
569 647 if group:
570 648 # XXX: in the sparse revlog case, group can become large,
571 649 # impacting performance. Some bounding or slicing mechanism
572 650 # would help to reduce this impact.
573 651 good = yield tuple(group)
574 652 yield None
575 653
576 654 def _findsnapshots(revlog, cache, start_rev):
577 655 """find snapshot from start_rev to tip"""
578 656 deltaparent = revlog.deltaparent
579 657 issnapshot = revlog.issnapshot
580 658 for rev in revlog.revs(start_rev):
581 659 if issnapshot(rev):
582 660 cache[deltaparent(rev)].append(rev)
583 661
584 662 def _refinedgroups(revlog, p1, p2, cachedelta):
585 663 good = None
586 664 # First we try to reuse the delta contained in the bundle.
587 665 # (or from the source revlog)
588 666 #
589 667 # This logic only applies to general delta repositories and can be disabled
590 668 # through configuration. Disabling reuse source delta is useful when
591 669 # we want to make sure we recomputed "optimal" deltas.
592 670 if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
593 671 # Assume what we received from the server is a good choice
594 672 # build delta will reuse the cache
595 673 good = yield (cachedelta[0],)
596 674 if good is not None:
597 675 yield None
598 676 return
599 677 for candidates in _rawgroups(revlog, p1, p2, cachedelta):
600 678 good = yield candidates
601 679 if good is not None:
602 680 break
603 681
604 682 # If sparse revlog is enabled, we can try to refine the available deltas
605 683 if not revlog._sparserevlog:
606 684 yield None
607 685 return
608 686
609 687 # if we have a refinable value, try to refine it
610 688 if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
611 689 # refine snapshot down
612 690 previous = None
613 691 while previous != good:
614 692 previous = good
615 693 base = revlog.deltaparent(good)
616 694 if base == nullrev:
617 695 break
618 696 good = yield (base,)
619 697 # refine snapshot up
620 698 #
621 699 # XXX the _findsnapshots call can be expensive and is "duplicated" with
622 700 # the one done in `_rawgroups`. Once we start working on performance,
623 701 # we should make the two logics share this computation.
624 702 snapshots = collections.defaultdict(list)
625 703 _findsnapshots(revlog, snapshots, good + 1)
626 704 previous = None
627 705 while good != previous:
628 706 previous = good
629 707 children = tuple(sorted(c for c in snapshots[good]))
630 708 good = yield children
631 709
632 710 # we have found nothing
633 711 yield None
634 712
635 713 def _rawgroups(revlog, p1, p2, cachedelta):
636 714 """Provides group of revision to be tested as delta base
637 715
638 716 This lower level function focuses on emitting deltas theoretically interesting
639 717 without looking at any practical details.
640 718
641 719 The group order aims at providing fast or small candidates first.
642 720 """
643 721 gdelta = revlog._generaldelta
644 722 sparse = revlog._sparserevlog
645 723 curr = len(revlog)
646 724 prev = curr - 1
647 725 deltachain = lambda rev: revlog._deltachain(rev)[0]
648 726
649 727 if gdelta:
650 728 # exclude already lazy tested base if any
651 729 parents = [p for p in (p1, p2) if p != nullrev]
652 730
653 731 if not revlog._deltabothparents and len(parents) == 2:
654 732 parents.sort()
655 733 # To minimize the chance of having to build a fulltext,
656 734 # pick first whichever parent is closest to us (max rev)
657 735 yield (parents[1],)
658 736 # then the other one (min rev) if the first did not fit
659 737 yield (parents[0],)
660 738 elif len(parents) > 0:
661 739 # Test all parents (1 or 2), and keep the best candidate
662 740 yield parents
663 741
664 742 if sparse and parents:
665 743 snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
666 744 # See if we can use an existing snapshot in the parent chains to use as
667 745 # a base for a new intermediate-snapshot
668 746 #
669 747 # search for snapshot in parents delta chain
670 748 # map: snapshot-level: snapshot-rev
671 749 parents_snaps = collections.defaultdict(set)
672 750 candidate_chains = [deltachain(p) for p in parents]
673 751 for chain in candidate_chains:
674 752 for idx, s in enumerate(chain):
675 753 if not revlog.issnapshot(s):
676 754 break
677 755 parents_snaps[idx].add(s)
678 756 snapfloor = min(parents_snaps[0]) + 1
679 757 _findsnapshots(revlog, snapshots, snapfloor)
680 758 # search for the highest "unrelated" revision
681 759 #
682 760 # Adding snapshots used by "unrelated" revisions increases the odds we
683 761 # reuse an independent, yet better snapshot chain.
684 762 #
685 763 # XXX instead of building a set of revisions, we could lazily enumerate
686 764 # over the chains. That would be more efficient, however we stick to
687 765 # simple code for now.
688 766 all_revs = set()
689 767 for chain in candidate_chains:
690 768 all_revs.update(chain)
691 769 other = None
692 770 for r in revlog.revs(prev, snapfloor):
693 771 if r not in all_revs:
694 772 other = r
695 773 break
696 774 if other is not None:
697 775 # To avoid unfair competition, we won't use unrelated intermediate
698 776 # snapshot that are deeper than the ones from the parent delta
699 777 # chain.
700 778 max_depth = max(parents_snaps.keys())
701 779 chain = deltachain(other)
702 780 for idx, s in enumerate(chain):
703 781 if s < snapfloor:
704 782 continue
705 783 if max_depth < idx:
706 784 break
707 785 if not revlog.issnapshot(s):
708 786 break
709 787 parents_snaps[idx].add(s)
710 788 # Test them as possible intermediate snapshot base
711 789 # We test them from highest to lowest level. High level one are more
712 790 # likely to result in small delta
713 791 floor = None
714 792 for idx, snaps in sorted(parents_snaps.items(), reverse=True):
715 793 siblings = set()
716 794 for s in snaps:
717 795 siblings.update(snapshots[s])
718 796 # Before considering making a new intermediate snapshot, we check
719 797 # if an existing snapshot, children of base we consider, would be
720 798 # suitable.
721 799 #
722 800 # It gives a chance to reuse a delta chain "unrelated" to the
723 801 # current revision instead of starting our own. Without such
724 802 # re-use, topological branches would keep reopening new chains,
725 803 # creating more and more snapshots as the repository grows.
726 804
727 805 if floor is not None:
728 806 # We only do this for siblings created after the one in our
729 807 # parent's delta chain. Those created before has less chances
730 808 # to be valid base since our ancestors had to create a new
731 809 # snapshot.
732 810 siblings = [r for r in siblings if floor < r]
733 811 yield tuple(sorted(siblings))
734 812 # then test the base from our parent's delta chain.
735 813 yield tuple(sorted(snaps))
736 814 floor = min(snaps)
737 815 # No suitable base found in the parent chain, search if any full
738 816 # snapshots emitted since parent's base would be a suitable base for an
739 817 # intermediate snapshot.
740 818 #
741 819 # It gives a chance to reuse a delta chain unrelated to the current
742 820 # revision instead of starting our own. Without such re-use,
743 821 # topological branches would keep reopening new full chains, creating
744 822 # more and more snapshots as the repository grows.
745 823 yield tuple(snapshots[nullrev])
746 824
747 825 if not sparse:
748 826 # other approach failed try against prev to hopefully save us a
749 827 # fulltext.
750 828 yield (prev,)
751 829
752 830 class deltacomputer(object):
753 831 def __init__(self, revlog):
754 832 self.revlog = revlog
755 833
756 834 def buildtext(self, revinfo, fh):
757 835 """Builds a fulltext version of a revision
758 836
759 837 revinfo: _revisioninfo instance that contains all needed info
760 838 fh: file handle to either the .i or the .d revlog file,
761 839 depending on whether it is inlined or not
762 840 """
763 841 btext = revinfo.btext
764 842 if btext[0] is not None:
765 843 return btext[0]
766 844
767 845 revlog = self.revlog
768 846 cachedelta = revinfo.cachedelta
769 847 baserev = cachedelta[0]
770 848 delta = cachedelta[1]
771 849
772 850 fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
773 851 revinfo.p1, revinfo.p2,
774 852 revinfo.flags, revinfo.node)
775 853 return fulltext
776 854
777 855 def _builddeltadiff(self, base, revinfo, fh):
778 856 revlog = self.revlog
779 857 t = self.buildtext(revinfo, fh)
780 858 if revlog.iscensored(base):
781 859 # deltas based on a censored revision must replace the
782 860 # full content in one patch, so delta works everywhere
783 861 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
784 862 delta = header + t
785 863 else:
786 864 ptext = revlog.revision(base, _df=fh, raw=True)
787 865 delta = mdiff.textdiff(ptext, t)
788 866
789 867 return delta
790 868
791 869 def _builddeltainfo(self, revinfo, base, fh):
792 870 # can we use the cached delta?
793 871 delta = None
794 872 if revinfo.cachedelta:
795 873 cachebase, cachediff = revinfo.cachedelta
796 874 #check if the diff still apply
797 875 currentbase = cachebase
798 876 while (currentbase != nullrev
799 877 and currentbase != base
800 878 and self.revlog.length(currentbase) == 0):
801 879 currentbase = self.revlog.deltaparent(currentbase)
802 880 if currentbase == base:
803 881 delta = revinfo.cachedelta[1]
804 882 if delta is None:
805 883 delta = self._builddeltadiff(base, revinfo, fh)
806 884 revlog = self.revlog
807 885 header, data = revlog.compress(delta)
808 886 deltalen = len(header) + len(data)
809 887 chainbase = revlog.chainbase(base)
810 888 offset = revlog.end(len(revlog) - 1)
811 889 dist = deltalen + offset - revlog.start(chainbase)
812 890 if revlog._generaldelta:
813 891 deltabase = base
814 892 else:
815 893 deltabase = chainbase
816 894 chainlen, compresseddeltalen = revlog._chaininfo(base)
817 895 chainlen += 1
818 896 compresseddeltalen += deltalen
819 897
820 898 revlog = self.revlog
821 899 snapshotdepth = None
822 900 if deltabase == nullrev:
823 901 snapshotdepth = 0
824 902 elif revlog._sparserevlog and revlog.issnapshot(deltabase):
825 903 # A delta chain should always be one full snapshot,
826 904 # zero or more semi-snapshots, and zero or more deltas
827 905 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
828 906 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
829 907 snapshotdepth = len(revlog._deltachain(deltabase)[0])
830 908
831 909 return _deltainfo(dist, deltalen, (header, data), deltabase,
832 910 chainbase, chainlen, compresseddeltalen,
833 911 snapshotdepth)
834 912
835 913 def _fullsnapshotinfo(self, fh, revinfo):
836 914 curr = len(self.revlog)
837 915 rawtext = self.buildtext(revinfo, fh)
838 916 data = self.revlog.compress(rawtext)
839 917 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
840 918 deltabase = chainbase = curr
841 919 snapshotdepth = 0
842 920 chainlen = 1
843 921
844 922 return _deltainfo(dist, deltalen, data, deltabase,
845 923 chainbase, chainlen, compresseddeltalen,
846 924 snapshotdepth)
847 925
848 926 def finddeltainfo(self, revinfo, fh):
849 927 """Find an acceptable delta against a candidate revision
850 928
851 929 revinfo: information about the revision (instance of _revisioninfo)
852 930 fh: file handle to either the .i or the .d revlog file,
853 931 depending on whether it is inlined or not
854 932
855 933 Returns the first acceptable candidate revision, as ordered by
856 934 _candidategroups
857 935
858 936 If no suitable deltabase is found, we return delta info for a full
859 937 snapshot.
860 938 """
861 939 if not revinfo.textlen:
862 940 return self._fullsnapshotinfo(fh, revinfo)
863 941
864 942 # no delta for flag processor revision (see "candelta" for why)
865 943 # not calling candelta since only one revision needs test, also to
866 944 # avoid overhead fetching flags again.
867 945 if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
868 946 return self._fullsnapshotinfo(fh, revinfo)
869 947
870 948 cachedelta = revinfo.cachedelta
871 949 p1 = revinfo.p1
872 950 p2 = revinfo.p2
873 951 revlog = self.revlog
874 952
875 953 deltainfo = None
876 954 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
877 955 groups = _candidategroups(self.revlog, revinfo.textlen,
878 956 p1r, p2r, cachedelta)
879 957 candidaterevs = next(groups)
880 958 while candidaterevs is not None:
881 959 nominateddeltas = []
882 960 if deltainfo is not None:
883 961 # if we already found a good delta,
884 962 # challenge it against refined candidates
885 963 nominateddeltas.append(deltainfo)
886 964 for candidaterev in candidaterevs:
887 965 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
888 966 if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
889 967 nominateddeltas.append(candidatedelta)
890 968 if nominateddeltas:
891 969 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
892 970 if deltainfo is not None:
893 971 candidaterevs = groups.send(deltainfo.base)
894 972 else:
895 973 candidaterevs = next(groups)
896 974
897 975 if deltainfo is None:
898 976 deltainfo = self._fullsnapshotinfo(fh, revinfo)
899 977 return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now