##// END OF EJS Templates
delta: filter nullrev out first...
Boris Feld -
r40993:f960c51e default
parent child Browse files
Show More
@@ -1,981 +1,981 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 )
34 34
35 35 # maximum <delta-chain-data>/<revision-text-length> ratio
36 36 LIMIT_DELTA2TEXT = 2
37 37
38 38 class _testrevlog(object):
39 39 """minimalist fake revlog to use in doctests"""
40 40
41 41 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
42 42 """data is an list of revision payload boundaries"""
43 43 self._data = data
44 44 self._srdensitythreshold = density
45 45 self._srmingapsize = mingap
46 46 self._snapshot = set(snapshot)
47 47 self.index = None
48 48
49 49 def start(self, rev):
50 50 if rev == 0:
51 51 return 0
52 52 return self._data[rev - 1]
53 53
54 54 def end(self, rev):
55 55 return self._data[rev]
56 56
57 57 def length(self, rev):
58 58 return self.end(rev) - self.start(rev)
59 59
60 60 def __len__(self):
61 61 return len(self._data)
62 62
63 63 def issnapshot(self, rev):
64 64 return rev in self._snapshot
65 65
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> data = [
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ]
    >>> revlog = _testrevlog(data, snapshot=range(16))

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
    if densityslicing is None:
        # no native (index-provided) implementation, fall back to the pure
        # Python density slicing
        def densityslicing(revs, density, mingap):
            return _slicechunktodensity(revlog, revs, density, mingap)
    density = revlog._srdensitythreshold
    mingap = revlog._srmingapsize
    for chunk in densityslicing(revs, density, mingap):
        # density-based chunks may still exceed the read-size target;
        # re-slice them by size before handing them out
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
132 132
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunks that density slicing selected but
    that are still too large compared to the read guarantee of revlog. This
    might happen when "minimal gap size" interrupted the slicing or when
    chains are built in a way that create large blocks next to each other.

    >>> data = [
    ...  3,  #0 (3)
    ...  5,  #1 (2)
    ...  6,  #2 (1)
    ...  8,  #3 (2)
    ...  8,  #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ]

    == All snapshots cases ==
    >>> revlog = _testrevlog(data, snapshot=range(9))

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == No Snapshot cases ==
    >>> revlog = _testrevlog(data)

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1], [2, 3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [4, 5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == mixed case ==
    >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
    >>> list(_slicechunktosize(revlog, list(range(9)), 5))
    [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
    """
    assert targetsize is None or 0 <= targetsize
    startdata = revlog.start(revs[0])
    enddata = revlog.end(revs[-1])
    fullspan = enddata - startdata
    # fast path: the whole span already fits within the target size
    if targetsize is None or fullspan <= targetsize:
        yield revs
        return

    startrevidx = 0
    endrevidx = 1
    iterrevs = enumerate(revs)
    next(iterrevs) # skip first rev.
    # first step: get snapshots out of the way
    for idx, r in iterrevs:
        span = revlog.end(r) - startdata
        snapshot = revlog.issnapshot(r)
        if span <= targetsize and snapshot:
            # still within budget and still a snapshot: extend current chunk
            endrevidx = idx + 1
        else:
            # flush the current chunk and restart one at this revision
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx + 1
        if not snapshot:
            # first non-snapshot revision: hand the rest over to the
            # binary-slicing phase below
            break

    # for the others, we use binary slicing to quickly converge toward valid
    # chunks (otherwise, we might end up looking for start/end of many
    # revisions). This logic is not looking for the perfect slicing point, it
    # focuses on quickly converging toward valid chunks.
    nbitem = len(revs)
    while (enddata - startdata) > targetsize:
        endrevidx = nbitem
        if nbitem - startrevidx <= 1:
            break # protect against individual chunk larger than limit
        localenddata = revlog.end(revs[endrevidx - 1])
        span = localenddata - startdata
        # halve the candidate chunk until it fits within targetsize
        while span > targetsize:
            if endrevidx - startrevidx <= 1:
                break # protect against individual chunk larger than limit
            endrevidx -= (endrevidx - startrevidx) // 2
            localenddata = revlog.end(revs[endrevidx - 1])
            span = localenddata - startdata
        chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
        if chunk:
            yield chunk
        startrevidx = endrevidx
        startdata = revlog.start(revs[startrevidx])

    # emit whatever remains after the last cut
    chunk = _trimchunk(revlog, revs, startrevidx)
    if chunk:
        yield chunk
286 286
def _slicechunktodensity(revlog, revs, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision can never be split further
    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    # the whole span is smaller than the minimum gap we would skip anyway
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    # already dense enough, no slicing needed
    if density >= targetdensity:
        yield revs
        return

    # Collect the gaps (unread spans between consecutive revisions) in a
    # list of (size, index) pairs, then sort it so the largest gaps can be
    # popped off the end first.
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps to pop them from largest to small
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # removing this gap shrinks the data actually read, which raises
        # the density of what remains
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    # final chunk: from the last cut to the end of the list
    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
412 412
413 413 def _trimchunk(revlog, revs, startidx, endidx=None):
414 414 """returns revs[startidx:endidx] without empty trailing revs
415 415
416 416 Doctest Setup
417 417 >>> revlog = _testrevlog([
418 418 ... 5, #0
419 419 ... 10, #1
420 420 ... 12, #2
421 421 ... 12, #3 (empty)
422 422 ... 17, #4
423 423 ... 21, #5
424 424 ... 21, #6 (empty)
425 425 ... ])
426 426
427 427 Contiguous cases:
428 428 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
429 429 [0, 1, 2, 3, 4, 5]
430 430 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
431 431 [0, 1, 2, 3, 4]
432 432 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
433 433 [0, 1, 2]
434 434 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
435 435 [2]
436 436 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
437 437 [3, 4, 5]
438 438 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
439 439 [3, 4]
440 440
441 441 Discontiguous cases:
442 442 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
443 443 [1, 3, 5]
444 444 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
445 445 [1]
446 446 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
447 447 [3, 5]
448 448 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
449 449 [3, 5]
450 450 """
451 451 length = revlog.length
452 452
453 453 if endidx is None:
454 454 endidx = len(revs)
455 455
456 456 # If we have a non-emtpy delta candidate, there are nothing to trim
457 457 if revs[endidx - 1] < len(revlog):
458 458 # Trim empty revs at the end, except the very first revision of a chain
459 459 while (endidx > 1
460 460 and endidx > startidx
461 461 and length(revs[endidx - 1]) == 0):
462 462 endidx -= 1
463 463
464 464 return revs[startidx:endidx]
465 465
def segmentspan(revlog, revs):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5,  #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    # an empty segment spans no bytes at all
    if not revs:
        return 0
    first, last = revs[0], revs[-1]
    # the span covers everything between the start of the first revision
    # and the end of the last one, gaps included
    return revlog.end(last) - revlog.start(first)
494 494
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata

    fh:           file handle to either the .i or the .d revlog file
    revlog:       the revlog the base revision lives in
    baserev:      revision number the delta applies to
    delta:        binary delta to apply on top of ``baserev``'s text
    p1, p2:       parent nodes, used for hash validation
    flags:        index flags of the new revision
    expectednode: node the resulting text must hash to

    Raises error.StorageError when a revision flagged as censored does not
    decode as censored; re-raises CensoredNodeError when the censored index
    flag is missing.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        if flags & REVIDX_ISCENSORED:
            # a censored revision must fail hash validation above; reaching
            # this point with the flag set means the content is inconsistent
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
523 523
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    """Immutable record describing a candidate delta for a revision."""
    # distance from the base revision (bounds the I/O needed to restore)
    distance = attr.ib()
    # length of the compressed delta
    deltalen = attr.ib()
    # (header, compressed-data) pair for the delta payload
    data = attr.ib()
    # revision number the delta applies against
    base = attr.ib()
    # first revision of the delta chain
    chainbase = attr.ib()
    # number of deltas in the chain
    chainlen = attr.ib()
    # total compressed size of all deltas in the chain
    compresseddeltalen = attr.ib()
    # snapshot depth when this is an intermediate snapshot, else None
    snapshotdepth = attr.ib()
534 534
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Returns True if the given delta is good.

    Good means that it is within the disk span, disk size, and chain
    length bounds that we know to be performant.
    """
    if deltainfo is None:
        return False

    # Two quantities bound the cost of using this delta:
    # - 'deltainfo.distance': the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen': the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    textlen = revinfo.textlen
    maxdist = revlog._maxdeltachainspan
    if not maxdist:
        # no configured limit: make sure the distance check cannot fail
        # on its own
        maxdist = deltainfo.distance
    maxdist = max(maxdist, textlen * 4)

    # Bad delta from read span:
    #
    # The span of data read would be larger than the maximum allowed.
    #
    # In the sparse-revlog case, we rely on the associated "sparse reading"
    # to avoid issue related to the span of data. In theory, it would be
    # possible to build pathological revlog where delta pattern would lead
    # to too many reads. However, they do not happen in practice at all. So
    # we skip the span check entirely.
    if not revlog._sparserevlog and maxdist < deltainfo.distance:
        return False

    # Bad delta from new delta size:
    #
    # A delta larger than the target text would be inefficient to store.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    # The sum of deltas got larger than K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    # The number of deltas in the chain got too high.
    maxchainlen = revlog._maxchainlen
    if maxchainlen and maxchainlen < deltainfo.chainlen:
        return False

    # Bad delta from intermediate snapshot size limit:
    #
    # The limit exists to prevent endless chains of intermediate deltas
    # from being created.
    snapdepth = deltainfo.snapshotdepth
    if snapdepth is not None and (textlen >> snapdepth) < deltainfo.deltalen:
        return False

    # Bad delta if the new intermediate snapshot is larger than the
    # previous snapshot.
    if snapdepth and revlog.length(deltainfo.base) < deltainfo.deltalen:
        return False

    return True
603 603
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides groups of revisions to be tested as delta base.

    This top level function focuses on emitting groups with unique and
    worthwhile content. See _rawgroups for details about the group order.

    This is a generator driven through ``send()``: the consumer sends back
    the base revision it eventually picked (or None), which is used to
    refine the next group of candidates.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    good = None

    # a delta chain whose cumulated size exceeds this bound can never be
    # "good" (see isgooddeltainfo), so such bases are filtered out below
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = set([nullrev])
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while (revlog._generaldelta
                   and not (rev == nullrev
                            or rev in tested
                            or deltalength(rev))):
                tested.add(rev)
                rev = deltaparent(rev)
            # no need to try a delta against nullrev, this will be done as
            # a last resort.
            if rev == nullrev:
                continue
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mecanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
657 657
658 658 def _findsnapshots(revlog, cache, start_rev):
659 659 """find snapshot from start_rev to tip"""
660 660 deltaparent = revlog.deltaparent
661 661 issnapshot = revlog.issnapshot
662 662 for rev in revlog.revs(start_rev):
663 663 if issnapshot(rev):
664 664 cache[deltaparent(rev)].append(rev)
665 665
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Generator of candidate groups, refined with consumer feedback.

    Driven through ``send()``: the consumer sends back the base it picked
    from the previous group (or None).  When sparse-revlog is enabled, a
    picked snapshot base is further refined by walking its delta chain down
    and its snapshot children up.  Always terminates by yielding None.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    for candidates in _rawgroups(revlog, p1, p2, cachedelta):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: follow the delta-parent chain toward the
        # root, offering each ancestor snapshot as a candidate base
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: offer the snapshot children of the current
        # pick as candidate bases
        #
        # XXX the _findsnapshots call can be expensive and is "duplicated" with
        # the one done in `_rawgroups`. Once we start working on performance,
        # we should make the two logics share this computation.
        snapshots = collections.defaultdict(list)
        _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
716 716
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This lower level function focuses on emitting deltas theoretically
    interesting, without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    # helper returning the list of revisions making up a delta chain
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    if sparse and parents:
        snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                # a chain starts with snapshots; stop at the first delta
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revision increase the odd we
        # reuse an independant, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshot that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level one are more
        # likely to result in small delta
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It give a change to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshot as the repository grow.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before has less chances
                # to be valid base since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It give a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshot as the repository grow.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
833 833
class deltacomputer(object):
    """Computes delta information for revisions to be added to a revlog.

    The main entry point is ``finddeltainfo``, which returns a _deltainfo
    describing either a delta against an acceptable base or a full snapshot.
    """

    def __init__(self, revlog):
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not
        """
        # revinfo.btext is a single-element list used as a mutable cache
        btext = revinfo.btext
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        # compute the fulltext and memoize it in the shared cache slot
        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        """Return the binary delta of ``revinfo``'s text against ``base``."""
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        """Build a _deltainfo for ``revinfo`` delta-ed against rev ``base``."""
        # can we use the cached delta?
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            #check if the diff still apply
            # walk over empty intermediate revisions: the cached diff still
            # applies if only zero-length deltas separate it from ``base``
            currentbase = cachebase
            while (currentbase != nullrev
                   and currentbase != base
                   and self.revlog.length(currentbase) == 0):
                currentbase = self.revlog.deltaparent(currentbase)
            if currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        offset = revlog.end(len(revlog) - 1)
        # distance from the start of the delta chain to the end of this delta
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            deltabase = base
        else:
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        snapshotdepth = None
        if deltabase == nullrev:
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        """Build a _deltainfo storing ``revinfo`` as a full snapshot."""
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        # a full snapshot: distance and lengths all equal the stored size
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        if not revinfo.textlen:
            # empty file: always store a full (empty) snapshot
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # keep the smallest acceptable delta seen so far
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                # feed the chosen base back so candidates can be refined
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now