##// END OF EJS Templates
revlog: fix pure python slicing test when chain contains nullrev...
Boris Feld -
r41110:88d813cd default
parent child Browse files
Show More
@@ -1,1000 +1,1012 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 )
34 34
35 35 # maximum <delta-chain-data>/<revision-text-length> ratio
36 36 LIMIT_DELTA2TEXT = 2
37 37
class _testrevlog(object):
    """Minimal stand-in for a revlog, used by the doctests in this module."""

    def __init__(self, data, density=0.5, mingap=0, snapshot=()):
        """``data`` lists the cumulative end offset of each revision payload"""
        self._data = data
        self._srdensitythreshold = density
        self._srmingapsize = mingap
        self._snapshot = set(snapshot)
        self.index = None

    def start(self, rev):
        # nullrev and the first stored revision both start at offset zero
        if rev in (nullrev, 0):
            return 0
        return self._data[rev - 1]

    def end(self, rev):
        return 0 if rev == nullrev else self._data[rev]

    def length(self, rev):
        return self.end(rev) - self.start(rev)

    def __len__(self):
        return len(self._data)

    def issnapshot(self, rev):
        # nullrev counts as an (empty) snapshot
        return rev == nullrev or rev in self._snapshot
65 71
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> data = [
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ]
    >>> revlog = _testrevlog(data, snapshot=range(16))

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]

    Slicing involving nullrev
    >>> list(slicechunk(revlog, [-1, 0, 11, 13, 15], targetsize=20))
    [[-1, 0], [11], [13, 15]]
    >>> list(slicechunk(revlog, [-1, 13, 15], targetsize=5))
    [[-1], [13], [15]]
    """
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # prefer the C-index implementation of density slicing when available
    densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
    if densityslicing is None:
        def densityslicing(x, y, z):
            return _slicechunktodensity(revlog, x, y, z)
    chunks = densityslicing(revs, revlog._srdensitythreshold,
                            revlog._srmingapsize)
    for chunk in chunks:
        # density-sliced chunks may still exceed the read-size guarantee
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
132 144
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunk that density slicing selected by that
    are still too large compared to the read garantee of revlog. This might
    happens when "minimal gap size" interrupted the slicing or when chain are
    built in a way that create large blocks next to each other.

    >>> data = [
    ...  3,  #0 (3)
    ...  5,  #1 (2)
    ...  6,  #2 (1)
    ...  8,  #3 (2)
    ...  8,  #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ]

    == All snapshots cases ==
    >>> revlog = _testrevlog(data, snapshot=range(9))

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == No Snapshot cases ==
    >>> revlog = _testrevlog(data)

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1], [2, 3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [4, 5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == mixed case ==
    >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
    >>> list(_slicechunktosize(revlog, list(range(9)), 5))
    [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
    """
    assert targetsize is None or 0 <= targetsize
    # Fast path: if the whole span already fits within the target size, the
    # chunk is emitted unchanged.
    startdata = revlog.start(revs[0])
    enddata = revlog.end(revs[-1])
    fullspan = enddata - startdata
    if targetsize is None or fullspan <= targetsize:
        yield revs
        return

    startrevidx = 0
    endrevidx = 1
    iterrevs = enumerate(revs)
    next(iterrevs) # skip first rev.
    # first step: get snapshots out of the way
    for idx, r in iterrevs:
        span = revlog.end(r) - startdata
        snapshot = revlog.issnapshot(r)
        if span <= targetsize and snapshot:
            # this snapshot still fits in the current chunk: extend the chunk
            endrevidx = idx + 1
        else:
            # flush the current chunk and start a new one at this revision
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx + 1
        if not snapshot:
            # first non-snapshot revision reached; the remaining revisions are
            # handled by the binary slicing below
            break

    # for the others, we use binary slicing to quickly converge toward valid
    # chunks (otherwise, we might end up looking for start/end of many
    # revisions). This logic is not looking for the perfect slicing point, it
    # focuses on quickly converging toward valid chunks.
    nbitem = len(revs)
    while (enddata - startdata) > targetsize:
        endrevidx = nbitem
        if nbitem - startrevidx <= 1:
            break # protect against individual chunk larger than limit
        localenddata = revlog.end(revs[endrevidx - 1])
        span = localenddata - startdata
        # repeatedly halve the candidate chunk until its span fits targetsize
        while span > targetsize:
            if endrevidx - startrevidx <= 1:
                break # protect against individual chunk larger than limit
            endrevidx -= (endrevidx - startrevidx) // 2
            localenddata = revlog.end(revs[endrevidx - 1])
            span = localenddata - startdata
        chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
        if chunk:
            yield chunk
        startrevidx = endrevidx
        startdata = revlog.start(revs[startrevidx])

    # emit whatever remains as the final chunk
    chunk = _trimchunk(revlog, revs, startrevidx)
    if chunk:
        yield chunk
286 298
def _slicechunktodensity(revlog, revs, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision cannot be sliced any further
    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    # span too small for any gap worth skipping
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    # already dense enough: nothing to slice
    if density >= targetdensity:
        yield revs
        return

    # Collect all the (size, index) gaps between consecutive non-empty
    # revisions
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps to pop them from largest to small
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # skipping this gap reduces the amount of data we need to read,
        # improving the density of what remains
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
412 424
413 425 def _trimchunk(revlog, revs, startidx, endidx=None):
414 426 """returns revs[startidx:endidx] without empty trailing revs
415 427
416 428 Doctest Setup
417 429 >>> revlog = _testrevlog([
418 430 ... 5, #0
419 431 ... 10, #1
420 432 ... 12, #2
421 433 ... 12, #3 (empty)
422 434 ... 17, #4
423 435 ... 21, #5
424 436 ... 21, #6 (empty)
425 437 ... ])
426 438
427 439 Contiguous cases:
428 440 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
429 441 [0, 1, 2, 3, 4, 5]
430 442 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
431 443 [0, 1, 2, 3, 4]
432 444 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
433 445 [0, 1, 2]
434 446 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
435 447 [2]
436 448 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
437 449 [3, 4, 5]
438 450 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
439 451 [3, 4]
440 452
441 453 Discontiguous cases:
442 454 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
443 455 [1, 3, 5]
444 456 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
445 457 [1]
446 458 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
447 459 [3, 5]
448 460 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
449 461 [3, 5]
450 462 """
451 463 length = revlog.length
452 464
453 465 if endidx is None:
454 466 endidx = len(revs)
455 467
456 468 # If we have a non-emtpy delta candidate, there are nothing to trim
457 469 if revs[endidx - 1] < len(revlog):
458 470 # Trim empty revs at the end, except the very first revision of a chain
459 471 while (endidx > 1
460 472 and endidx > startidx
461 473 and length(revs[endidx - 1]) == 0):
462 474 endidx -= 1
463 475
464 476 return revs[startidx:endidx]
465 477
def segmentspan(revlog, revs):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5,  #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    # an empty segment obviously spans no bytes
    if not revs:
        return 0
    # distance between the end of the last revision and the start of the first
    return revlog.end(revs[-1]) - revlog.start(revs[0])
494 506
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata"""
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        # run the read-direction flag processors, then validate the text
        # against the node we expect
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        # if the censored flag is set, decoding above should have raised
        # CensoredNodeError; reaching this point means the flag is a lie
        if flags & REVIDX_ISCENSORED:
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
523 535
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    # container for the data describing one candidate delta
    # byte distance from the delta chain base (bounds the read span)
    distance = attr.ib()
    # size, in bytes, of the compressed delta (header included)
    deltalen = attr.ib()
    # (header, compressed-data) pair as returned by revlog.compress
    data = attr.ib()
    # revision this delta is computed against
    base = attr.ib()
    # first revision of the delta chain
    chainbase = attr.ib()
    # number of deltas in the chain (this one included)
    chainlen = attr.ib()
    # cumulated compressed size of all deltas in the chain
    compresseddeltalen = attr.ib()
    # depth if this delta is an intermediate snapshot, None otherwise
    snapshotdepth = attr.ib()
534 546
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Return True if the given delta is good.

    Good means that it is within the disk span, disk size, and chain length
    bounds that we know to be performant.
    """
    if deltainfo is None:
        return False

    # - 'deltainfo.distance' is the distance from the base revision --
    #   bounding it limits the amount of I/O we need to do.
    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
    #   deltas we need to apply -- bounding it limits the amount of CPU
    #   we consume.

    textlen = revinfo.textlen

    # maximum span we accept to read to restore this revision
    maxdist = revlog._maxdeltachainspan or deltainfo.distance
    maxdist = max(maxdist, textlen * 4)

    # Bad delta from read span:
    #
    # If the span of data read is larger than the maximum allowed.
    #
    # In the sparse-revlog case, we rely on the associated "sparse reading"
    # to avoid issue related to the span of data. In theory, it would be
    # possible to build pathological revlog where delta pattern would lead
    # to too many reads. However, they do not happen in practice at all. So
    # we skip the span check entirely.
    if not revlog._sparserevlog and maxdist < deltainfo.distance:
        return False

    # Bad delta from new delta size:
    #
    # A delta larger than the target text defeats the point of storing one.
    if textlen < deltainfo.deltalen:
        return False

    # Bad delta from cumulated payload size:
    #
    # The sum of deltas must stay below K * target text length.
    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
        return False

    # Bad delta from chain length:
    #
    # Overlong chains are expensive to restore.
    if revlog._maxchainlen and revlog._maxchainlen < deltainfo.chainlen:
        return False

    # bad delta from intermediate snapshot size limit
    #
    # The limit exists to prevent endless chains of intermediate deltas from
    # being created.
    if (deltainfo.snapshotdepth is not None
            and (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
        return False

    # bad delta if new intermediate snapshot is larger than the previous
    # snapshot
    if (deltainfo.snapshotdepth
            and revlog.length(deltainfo.base) < deltainfo.deltalen):
        return False

    return True
603 615
604 616 # If a revision's full text is that much bigger than a base candidate full
605 617 # text's, it is very unlikely that it will produce a valid delta. We no longer
606 618 # consider these candidates.
607 619 LIMIT_BASE2TEXT = 500
608 620
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This top level function focus on emitting groups with unique and worthwhile
    content. See _raw_candidate_groups for details about the group order.

    This is a generator driven through ``send()``: after each yielded group the
    caller sends back the revision retained as a good base (or None), which is
    forwarded to the underlying ``_refinedgroups`` generator.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    sparse = revlog._sparserevlog
    good = None

    # upper bound of the data a whole chain may weigh for this text
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    tested = set([nullrev])
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while (revlog._generaldelta
                   and not (rev == nullrev
                            or rev in tested
                            or deltalength(rev))):
                tested.add(rev)
                rev = deltaparent(rev)
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            # If we reach here, we are about to build and test a delta.
            # The delta building process will compute the chaininfo in all
            # case, since that computation is cached, it is fine to access it
            # here too.
            chainlen, chainsize = revlog._chaininfo(rev)
            # if chain will be too long, skip base
            if revlog._maxchainlen and chainlen >= revlog._maxchainlen:
                continue
            # if chain already have too much data, skip base
            if deltas_limit < chainsize:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mecanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
676 688
677 689 def _findsnapshots(revlog, cache, start_rev):
678 690 """find snapshot from start_rev to tip"""
679 691 deltaparent = revlog.deltaparent
680 692 issnapshot = revlog.issnapshot
681 693 for rev in revlog.revs(start_rev):
682 694 if issnapshot(rev):
683 695 cache[deltaparent(rev)].append(rev)
684 696
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Emit groups of candidate delta bases, refining them using feedback.

    This is a generator driven through ``send()``: after each yielded group,
    the caller sends back the revision it retained as a good base (or None).
    On sparse revlogs, a good snapshot base is then refined further along its
    own chain. A final ``None`` signals exhaustion.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    for candidates in _rawgroups(revlog, p1, p2, cachedelta):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk toward the chain base, trying each
        # ancestor snapshot as a (possibly smaller) delta base
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: try the snapshots built on top of the current
        # good base
        #
        # XXX the _findsnapshots call can be expensive and is "duplicated" with
        # the one done in `_rawgroups`. Once we start working on performance,
        # we should make the two logics share this computation.
        snapshots = collections.defaultdict(list)
        _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
735 747
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This lower level function focuses on emitting theoretically interesting
    deltas without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    # NOTE(review): ``parents`` is only bound when gdelta is true; this
    # branch appears to assume sparse-revlog implies generaldelta -- confirm.
    if sparse and parents:
        snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revision increase the odd we
        # reuse an independent, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshot that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level one are more
        # likely to result in small delta
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshot as the repository grow.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before has less chances
                # to be valid base since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshot as the repository grow.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
852 864
853 865 class deltacomputer(object):
    def __init__(self, revlog):
        # revlog we compute deltas for
        self.revlog = revlog
856 868
857 869 def buildtext(self, revinfo, fh):
858 870 """Builds a fulltext version of a revision
859 871
860 872 revinfo: _revisioninfo instance that contains all needed info
861 873 fh: file handle to either the .i or the .d revlog file,
862 874 depending on whether it is inlined or not
863 875 """
864 876 btext = revinfo.btext
865 877 if btext[0] is not None:
866 878 return btext[0]
867 879
868 880 revlog = self.revlog
869 881 cachedelta = revinfo.cachedelta
870 882 baserev = cachedelta[0]
871 883 delta = cachedelta[1]
872 884
873 885 fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
874 886 revinfo.p1, revinfo.p2,
875 887 revinfo.flags, revinfo.node)
876 888 return fulltext
877 889
878 890 def _builddeltadiff(self, base, revinfo, fh):
879 891 revlog = self.revlog
880 892 t = self.buildtext(revinfo, fh)
881 893 if revlog.iscensored(base):
882 894 # deltas based on a censored revision must replace the
883 895 # full content in one patch, so delta works everywhere
884 896 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
885 897 delta = header + t
886 898 else:
887 899 ptext = revlog.revision(base, _df=fh, raw=True)
888 900 delta = mdiff.textdiff(ptext, t)
889 901
890 902 return delta
891 903
892 904 def _builddeltainfo(self, revinfo, base, fh):
893 905 # can we use the cached delta?
894 906 delta = None
895 907 if revinfo.cachedelta:
896 908 cachebase, cachediff = revinfo.cachedelta
897 909 #check if the diff still apply
898 910 currentbase = cachebase
899 911 while (currentbase != nullrev
900 912 and currentbase != base
901 913 and self.revlog.length(currentbase) == 0):
902 914 currentbase = self.revlog.deltaparent(currentbase)
903 915 if currentbase == base:
904 916 delta = revinfo.cachedelta[1]
905 917 if delta is None:
906 918 delta = self._builddeltadiff(base, revinfo, fh)
907 919 revlog = self.revlog
908 920 header, data = revlog.compress(delta)
909 921 deltalen = len(header) + len(data)
910 922 chainbase = revlog.chainbase(base)
911 923 offset = revlog.end(len(revlog) - 1)
912 924 dist = deltalen + offset - revlog.start(chainbase)
913 925 if revlog._generaldelta:
914 926 deltabase = base
915 927 else:
916 928 deltabase = chainbase
917 929 chainlen, compresseddeltalen = revlog._chaininfo(base)
918 930 chainlen += 1
919 931 compresseddeltalen += deltalen
920 932
921 933 revlog = self.revlog
922 934 snapshotdepth = None
923 935 if deltabase == nullrev:
924 936 snapshotdepth = 0
925 937 elif revlog._sparserevlog and revlog.issnapshot(deltabase):
926 938 # A delta chain should always be one full snapshot,
927 939 # zero or more semi-snapshots, and zero or more deltas
928 940 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
929 941 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
930 942 snapshotdepth = len(revlog._deltachain(deltabase)[0])
931 943
932 944 return _deltainfo(dist, deltalen, (header, data), deltabase,
933 945 chainbase, chainlen, compresseddeltalen,
934 946 snapshotdepth)
935 947
936 948 def _fullsnapshotinfo(self, fh, revinfo):
937 949 curr = len(self.revlog)
938 950 rawtext = self.buildtext(revinfo, fh)
939 951 data = self.revlog.compress(rawtext)
940 952 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
941 953 deltabase = chainbase = curr
942 954 snapshotdepth = 0
943 955 chainlen = 1
944 956
945 957 return _deltainfo(dist, deltalen, data, deltabase,
946 958 chainbase, chainlen, compresseddeltalen,
947 959 snapshotdepth)
948 960
949 961 def finddeltainfo(self, revinfo, fh):
950 962 """Find an acceptable delta against a candidate revision
951 963
952 964 revinfo: information about the revision (instance of _revisioninfo)
953 965 fh: file handle to either the .i or the .d revlog file,
954 966 depending on whether it is inlined or not
955 967
956 968 Returns the first acceptable candidate revision, as ordered by
957 969 _candidategroups
958 970
959 971 If no suitable deltabase is found, we return delta info for a full
960 972 snapshot.
961 973 """
962 974 if not revinfo.textlen:
963 975 return self._fullsnapshotinfo(fh, revinfo)
964 976
965 977 # no delta for flag processor revision (see "candelta" for why)
966 978 # not calling candelta since only one revision needs test, also to
967 979 # avoid overhead fetching flags again.
968 980 if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
969 981 return self._fullsnapshotinfo(fh, revinfo)
970 982
971 983 cachedelta = revinfo.cachedelta
972 984 p1 = revinfo.p1
973 985 p2 = revinfo.p2
974 986 revlog = self.revlog
975 987
976 988 deltainfo = None
977 989 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
978 990 groups = _candidategroups(self.revlog, revinfo.textlen,
979 991 p1r, p2r, cachedelta)
980 992 candidaterevs = next(groups)
981 993 while candidaterevs is not None:
982 994 nominateddeltas = []
983 995 if deltainfo is not None:
984 996 # if we already found a good delta,
985 997 # challenge it against refined candidates
986 998 nominateddeltas.append(deltainfo)
987 999 for candidaterev in candidaterevs:
988 1000 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
989 1001 if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
990 1002 nominateddeltas.append(candidatedelta)
991 1003 if nominateddeltas:
992 1004 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
993 1005 if deltainfo is not None:
994 1006 candidaterevs = groups.send(deltainfo.base)
995 1007 else:
996 1008 candidaterevs = next(groups)
997 1009
998 1010 if deltainfo is None:
999 1011 deltainfo = self._fullsnapshotinfo(fh, revinfo)
1000 1012 return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now