revlog: limit base to rev size ratio to 500 instead of 50...
Boris Feld
r41062:b3734779 default
@@ -1,1000 +1,1000 @@
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 )
34 34
35 35 # maximum <delta-chain-data>/<revision-text-length> ratio
36 36 LIMIT_DELTA2TEXT = 2
37 37
38 38 class _testrevlog(object):
39 39 """minimalist fake revlog to use in doctests"""
40 40
41 41 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
42 42 """data is an list of revision payload boundaries"""
43 43 self._data = data
44 44 self._srdensitythreshold = density
45 45 self._srmingapsize = mingap
46 46 self._snapshot = set(snapshot)
47 47 self.index = None
48 48
49 49 def start(self, rev):
50 50 if rev == 0:
51 51 return 0
52 52 return self._data[rev - 1]
53 53
54 54 def end(self, rev):
55 55 return self._data[rev]
56 56
57 57 def length(self, rev):
58 58 return self.end(rev) - self.start(rev)
59 59
60 60 def __len__(self):
61 61 return len(self._data)
62 62
63 63 def issnapshot(self, rev):
64 64 return rev in self._snapshot
65 65
66 66 def slicechunk(revlog, revs, targetsize=None):
67 67 """slice revs to reduce the amount of unrelated data to be read from disk.
68 68
69 69 ``revs`` is sliced into groups that should be read in one go.
70 70 Assume that revs are sorted.
71 71
72 72 The initial chunk is sliced until the overall density (payload/chunks-span
73 73 ratio) is above `revlog._srdensitythreshold`. No gap smaller than
74 74 `revlog._srmingapsize` is skipped.
75 75
76 76 If `targetsize` is set, no chunk larger than `targetsize` will be yielded.
77 77 For consistency with other slicing choices, this limit won't go lower than
78 78 `revlog._srmingapsize`.
79 79
80 80 If individual revision chunks are larger than this limit, they will still
81 81 be yielded individually.
82 82
83 83 >>> data = [
84 84 ... 5, #00 (5)
85 85 ... 10, #01 (5)
86 86 ... 12, #02 (2)
87 87 ... 12, #03 (empty)
88 88 ... 27, #04 (15)
89 89 ... 31, #05 (4)
90 90 ... 31, #06 (empty)
91 91 ... 42, #07 (11)
92 92 ... 47, #08 (5)
93 93 ... 47, #09 (empty)
94 94 ... 48, #10 (1)
95 95 ... 51, #11 (3)
96 96 ... 74, #12 (23)
97 97 ... 85, #13 (11)
98 98 ... 86, #14 (1)
99 99 ... 91, #15 (5)
100 100 ... ]
101 101 >>> revlog = _testrevlog(data, snapshot=range(16))
102 102
103 103 >>> list(slicechunk(revlog, list(range(16))))
104 104 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
105 105 >>> list(slicechunk(revlog, [0, 15]))
106 106 [[0], [15]]
107 107 >>> list(slicechunk(revlog, [0, 11, 15]))
108 108 [[0], [11], [15]]
109 109 >>> list(slicechunk(revlog, [0, 11, 13, 15]))
110 110 [[0], [11, 13, 15]]
111 111 >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
112 112 [[1, 2], [5, 8, 10, 11], [14]]
113 113
114 114 Slicing with a maximum chunk size
115 115 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
116 116 [[0], [11], [13], [15]]
117 117 >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
118 118 [[0], [11], [13, 15]]
119 119 """
120 120 if targetsize is not None:
121 121 targetsize = max(targetsize, revlog._srmingapsize)
122 122 # targetsize should not be specified when evaluating delta candidates:
123 123 # * targetsize is used to ensure we stay within specification when reading.
124 124 densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
125 125 if densityslicing is None:
126 126 densityslicing = lambda x, y, z: _slicechunktodensity(revlog, x, y, z)
127 127 for chunk in densityslicing(revs,
128 128 revlog._srdensitythreshold,
129 129 revlog._srmingapsize):
130 130 for subchunk in _slicechunktosize(revlog, chunk, targetsize):
131 131 yield subchunk
132 132
133 133 def _slicechunktosize(revlog, revs, targetsize=None):
134 134 """slice revs to match the target size
135 135
136 136 This is intended to be used on chunks that density slicing selected, but
137 137 that are still too large compared to the read guarantee of revlogs. This
138 138 might happen when the "minimal gap size" interrupted the slicing, or when
139 139 chains are built in a way that creates large blocks next to each other.
140 140
141 141 >>> data = [
142 142 ... 3, #0 (3)
143 143 ... 5, #1 (2)
144 144 ... 6, #2 (1)
145 145 ... 8, #3 (2)
146 146 ... 8, #4 (empty)
147 147 ... 11, #5 (3)
148 148 ... 12, #6 (1)
149 149 ... 13, #7 (1)
150 150 ... 14, #8 (1)
151 151 ... ]
152 152
153 153 == All snapshots cases ==
154 154 >>> revlog = _testrevlog(data, snapshot=range(9))
155 155
156 156 Cases where chunk is already small enough
157 157 >>> list(_slicechunktosize(revlog, [0], 3))
158 158 [[0]]
159 159 >>> list(_slicechunktosize(revlog, [6, 7], 3))
160 160 [[6, 7]]
161 161 >>> list(_slicechunktosize(revlog, [0], None))
162 162 [[0]]
163 163 >>> list(_slicechunktosize(revlog, [6, 7], None))
164 164 [[6, 7]]
165 165
166 166 cases where we need actual slicing
167 167 >>> list(_slicechunktosize(revlog, [0, 1], 3))
168 168 [[0], [1]]
169 169 >>> list(_slicechunktosize(revlog, [1, 3], 3))
170 170 [[1], [3]]
171 171 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
172 172 [[1, 2], [3]]
173 173 >>> list(_slicechunktosize(revlog, [3, 5], 3))
174 174 [[3], [5]]
175 175 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
176 176 [[3], [5]]
177 177 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
178 178 [[5], [6, 7, 8]]
179 179 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
180 180 [[0], [1, 2], [3], [5], [6, 7, 8]]
181 181
182 182 Case with too large individual chunk (must return valid chunk)
183 183 >>> list(_slicechunktosize(revlog, [0, 1], 2))
184 184 [[0], [1]]
185 185 >>> list(_slicechunktosize(revlog, [1, 3], 1))
186 186 [[1], [3]]
187 187 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
188 188 [[3], [5]]
189 189
190 190 == No Snapshot cases ==
191 191 >>> revlog = _testrevlog(data)
192 192
193 193 Cases where chunk is already small enough
194 194 >>> list(_slicechunktosize(revlog, [0], 3))
195 195 [[0]]
196 196 >>> list(_slicechunktosize(revlog, [6, 7], 3))
197 197 [[6, 7]]
198 198 >>> list(_slicechunktosize(revlog, [0], None))
199 199 [[0]]
200 200 >>> list(_slicechunktosize(revlog, [6, 7], None))
201 201 [[6, 7]]
202 202
203 203 cases where we need actual slicing
204 204 >>> list(_slicechunktosize(revlog, [0, 1], 3))
205 205 [[0], [1]]
206 206 >>> list(_slicechunktosize(revlog, [1, 3], 3))
207 207 [[1], [3]]
208 208 >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
209 209 [[1], [2, 3]]
210 210 >>> list(_slicechunktosize(revlog, [3, 5], 3))
211 211 [[3], [5]]
212 212 >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
213 213 [[3], [4, 5]]
214 214 >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
215 215 [[5], [6, 7, 8]]
216 216 >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
217 217 [[0], [1, 2], [3], [5], [6, 7, 8]]
218 218
219 219 Case with too large individual chunk (must return valid chunk)
220 220 >>> list(_slicechunktosize(revlog, [0, 1], 2))
221 221 [[0], [1]]
222 222 >>> list(_slicechunktosize(revlog, [1, 3], 1))
223 223 [[1], [3]]
224 224 >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
225 225 [[3], [5]]
226 226
227 227 == mixed case ==
228 228 >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
229 229 >>> list(_slicechunktosize(revlog, list(range(9)), 5))
230 230 [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
231 231 """
232 232 assert targetsize is None or 0 <= targetsize
233 233 startdata = revlog.start(revs[0])
234 234 enddata = revlog.end(revs[-1])
235 235 fullspan = enddata - startdata
236 236 if targetsize is None or fullspan <= targetsize:
237 237 yield revs
238 238 return
239 239
240 240 startrevidx = 0
241 241 endrevidx = 1
242 242 iterrevs = enumerate(revs)
243 243 next(iterrevs) # skip first rev.
244 244 # first step: get snapshots out of the way
245 245 for idx, r in iterrevs:
246 246 span = revlog.end(r) - startdata
247 247 snapshot = revlog.issnapshot(r)
248 248 if span <= targetsize and snapshot:
249 249 endrevidx = idx + 1
250 250 else:
251 251 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
252 252 if chunk:
253 253 yield chunk
254 254 startrevidx = idx
255 255 startdata = revlog.start(r)
256 256 endrevidx = idx + 1
257 257 if not snapshot:
258 258 break
259 259
260 260 # for the others, we use binary slicing to quickly converge toward valid
261 261 # chunks (otherwise, we might end up looking for start/end of many
262 262 # revisions). This logic is not looking for the perfect slicing point; it
263 263 # focuses on quickly converging toward valid chunks.
264 264 nbitem = len(revs)
265 265 while (enddata - startdata) > targetsize:
266 266 endrevidx = nbitem
267 267 if nbitem - startrevidx <= 1:
268 268 break # protect against individual chunk larger than limit
269 269 localenddata = revlog.end(revs[endrevidx - 1])
270 270 span = localenddata - startdata
271 271 while span > targetsize:
272 272 if endrevidx - startrevidx <= 1:
273 273 break # protect against individual chunk larger than limit
274 274 endrevidx -= (endrevidx - startrevidx) // 2
275 275 localenddata = revlog.end(revs[endrevidx - 1])
276 276 span = localenddata - startdata
277 277 chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
278 278 if chunk:
279 279 yield chunk
280 280 startrevidx = endrevidx
281 281 startdata = revlog.start(revs[startrevidx])
282 282
283 283 chunk = _trimchunk(revlog, revs, startrevidx)
284 284 if chunk:
285 285 yield chunk
286 286
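Note on the binary slicing above: it converges by repeatedly halving the candidate end index rather than scanning revision by revision. A minimal sketch of that halving step, under the assumption of a plain list of cumulative end offsets (`ends` and `halve_to_fit` are hypothetical names, not revlog API):

def halve_to_fit(ends, startidx, targetsize):
    # ends[i] is the cumulative end offset of revision i
    startdata = 0 if startidx == 0 else ends[startidx - 1]
    endidx = len(ends)
    while ends[endidx - 1] - startdata > targetsize:
        if endidx - startidx <= 1:
            break  # a single oversized revision is still yielded as-is
        endidx -= (endidx - startidx) // 2  # same halving step as above
    return endidx

# With ends = [3, 5, 6, 8, 8, 11] and targetsize = 6, starting at 0:
# span 11 > 6, halve 6 -> 3; revisions 0..2 span exactly 6 bytes.
assert halve_to_fit([3, 5, 6, 8, 8, 11], 0, 6) == 3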
287 287 def _slicechunktodensity(revlog, revs, targetdensity=0.5,
288 288 mingapsize=0):
289 289 """slice revs to reduce the amount of unrelated data to be read from disk.
290 290
291 291 ``revs`` is sliced into groups that should be read in one go.
292 292 Assume that revs are sorted.
293 293
294 294 The initial chunk is sliced until the overall density (payload/chunks-span
295 295 ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
296 296 skipped.
297 297
298 298 >>> revlog = _testrevlog([
299 299 ... 5, #00 (5)
300 300 ... 10, #01 (5)
301 301 ... 12, #02 (2)
302 302 ... 12, #03 (empty)
303 303 ... 27, #04 (15)
304 304 ... 31, #05 (4)
305 305 ... 31, #06 (empty)
306 306 ... 42, #07 (11)
307 307 ... 47, #08 (5)
308 308 ... 47, #09 (empty)
309 309 ... 48, #10 (1)
310 310 ... 51, #11 (3)
311 311 ... 74, #12 (23)
312 312 ... 85, #13 (11)
313 313 ... 86, #14 (1)
314 314 ... 91, #15 (5)
315 315 ... ])
316 316
317 317 >>> list(_slicechunktodensity(revlog, list(range(16))))
318 318 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
319 319 >>> list(_slicechunktodensity(revlog, [0, 15]))
320 320 [[0], [15]]
321 321 >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
322 322 [[0], [11], [15]]
323 323 >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
324 324 [[0], [11, 13, 15]]
325 325 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
326 326 [[1, 2], [5, 8, 10, 11], [14]]
327 327 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
328 328 ... mingapsize=20))
329 329 [[1, 2, 3, 5, 8, 10, 11], [14]]
330 330 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
331 331 ... targetdensity=0.95))
332 332 [[1, 2], [5], [8, 10, 11], [14]]
333 333 >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
334 334 ... targetdensity=0.95, mingapsize=12))
335 335 [[1, 2], [5, 8, 10, 11], [14]]
336 336 """
337 337 start = revlog.start
338 338 length = revlog.length
339 339
340 340 if len(revs) <= 1:
341 341 yield revs
342 342 return
343 343
344 344 deltachainspan = segmentspan(revlog, revs)
345 345
346 346 if deltachainspan < mingapsize:
347 347 yield revs
348 348 return
349 349
350 350 readdata = deltachainspan
351 351 chainpayload = sum(length(r) for r in revs)
352 352
353 353 if deltachainspan:
354 354 density = chainpayload / float(deltachainspan)
355 355 else:
356 356 density = 1.0
357 357
358 358 if density >= targetdensity:
359 359 yield revs
360 360 return
361 361
362 362 # Collect the gaps; they are sorted below so the largest can be popped first
363 363 gaps = []
364 364 prevend = None
365 365 for i, rev in enumerate(revs):
366 366 revstart = start(rev)
367 367 revlen = length(rev)
368 368
369 369 # Skip empty revisions to form larger holes
370 370 if revlen == 0:
371 371 continue
372 372
373 373 if prevend is not None:
374 374 gapsize = revstart - prevend
375 375 # only consider holes that are large enough
376 376 if gapsize > mingapsize:
377 377 gaps.append((gapsize, i))
378 378
379 379 prevend = revstart + revlen
380 380 # sort the gaps so they can be popped from largest to smallest
381 381 gaps.sort()
382 382
383 383 # Collect the indices of the largest holes until the density is acceptable
384 384 selected = []
385 385 while gaps and density < targetdensity:
386 386 gapsize, gapidx = gaps.pop()
387 387
388 388 selected.append(gapidx)
389 389
390 390 # skipping this gap reduces the amount of data we would actually
391 391 # read, improving the density
392 392 readdata -= gapsize
393 393 if readdata > 0:
394 394 density = chainpayload / float(readdata)
395 395 else:
396 396 density = 1.0
397 397 selected.sort()
398 398
399 399 # Cut the revs at collected indices
400 400 previdx = 0
401 401 for idx in selected:
402 402
403 403 chunk = _trimchunk(revlog, revs, previdx, idx)
404 404 if chunk:
405 405 yield chunk
406 406
407 407 previdx = idx
408 408
409 409 chunk = _trimchunk(revlog, revs, previdx)
410 410 if chunk:
411 411 yield chunk
412 412
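The gap-popping loop above maintains a simple invariant: density is the payload divided by the bytes actually read, and removing the largest remaining gap is the fastest way to raise it. A toy rerun of that loop with hypothetical sizes:

payload = 30                     # bytes we actually need
gaps = [2, 5, 23]                # inter-revision holes, sorted ascending
readdata = payload + sum(gaps)   # 60 bytes read, density 0.5
targetdensity = 0.75
while gaps and payload / float(readdata) < targetdensity:
    readdata -= gaps.pop()       # drop the largest remaining gap
# dropping the 23-byte gap leaves readdata == 37, density ~0.81 >= 0.75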
413 413 def _trimchunk(revlog, revs, startidx, endidx=None):
414 414 """returns revs[startidx:endidx] without empty trailing revs
415 415
416 416 Doctest Setup
417 417 >>> revlog = _testrevlog([
418 418 ... 5, #0
419 419 ... 10, #1
420 420 ... 12, #2
421 421 ... 12, #3 (empty)
422 422 ... 17, #4
423 423 ... 21, #5
424 424 ... 21, #6 (empty)
425 425 ... ])
426 426
427 427 Contiguous cases:
428 428 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
429 429 [0, 1, 2, 3, 4, 5]
430 430 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
431 431 [0, 1, 2, 3, 4]
432 432 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
433 433 [0, 1, 2]
434 434 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
435 435 [2]
436 436 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
437 437 [3, 4, 5]
438 438 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
439 439 [3, 4]
440 440
441 441 Discontiguous cases:
442 442 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
443 443 [1, 3, 5]
444 444 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
445 445 [1]
446 446 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
447 447 [3, 5]
448 448 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
449 449 [3, 5]
450 450 """
451 451 length = revlog.length
452 452
453 453 if endidx is None:
454 454 endidx = len(revs)
455 455
456 456 # If we have a non-empty delta candidate, there is nothing to trim
457 457 if revs[endidx - 1] < len(revlog):
458 458 # Trim empty revs at the end, except the very first revision of a chain
459 459 while (endidx > 1
460 460 and endidx > startidx
461 461 and length(revs[endidx - 1]) == 0):
462 462 endidx -= 1
463 463
464 464 return revs[startidx:endidx]
465 465
466 466 def segmentspan(revlog, revs):
467 467 """Get the byte span of a segment of revisions
468 468
469 469 revs is a sorted array of revision numbers
470 470
471 471 >>> revlog = _testrevlog([
472 472 ... 5, #0
473 473 ... 10, #1
474 474 ... 12, #2
475 475 ... 12, #3 (empty)
476 476 ... 17, #4
477 477 ... ])
478 478
479 479 >>> segmentspan(revlog, [0, 1, 2, 3, 4])
480 480 17
481 481 >>> segmentspan(revlog, [0, 4])
482 482 17
483 483 >>> segmentspan(revlog, [3, 4])
484 484 5
485 485 >>> segmentspan(revlog, [1, 2, 3,])
486 486 7
487 487 >>> segmentspan(revlog, [1, 3])
488 488 7
489 489 """
490 490 if not revs:
491 491 return 0
492 492 end = revlog.end(revs[-1])
493 493 return end - revlog.start(revs[0])
494 494
495 495 def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
496 496 """build full text from a (base, delta) pair and other metadata"""
497 497 # special case deltas which replace entire base; no need to decode
498 498 # base revision. this neatly avoids censored bases, which throw when
499 499 # they're decoded.
500 500 hlen = struct.calcsize(">lll")
501 501 if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
502 502 len(delta) - hlen):
503 503 fulltext = delta[hlen:]
504 504 else:
505 505 # deltabase is the rawtext before it was changed by flag processors,
506 506 # which is equivalent to non-raw text
507 507 basetext = revlog.revision(baserev, _df=fh, raw=False)
508 508 fulltext = mdiff.patch(basetext, delta)
509 509
510 510 try:
511 511 res = revlog._processflags(fulltext, flags, 'read', raw=True)
512 512 fulltext, validatehash = res
513 513 if validatehash:
514 514 revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
515 515 if flags & REVIDX_ISCENSORED:
516 516 raise error.StorageError(_('node %s is not censored') %
517 517 expectednode)
518 518 except error.CensoredNodeError:
519 519 # must pass the censored index flag to add censored revisions
520 520 if not flags & REVIDX_ISCENSORED:
521 521 raise
522 522 return fulltext
523 523
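The fast path at the top of `_textfromdelta` works because a delta that replaces its entire base starts with a fixed `>lll` header: hunk start 0, hunk end = base length, new length. A small sketch consistent with the `struct.calcsize(">lll")` check above (`replaceheader_sketch` is a hypothetical stand-in for `mdiff.replacediffheader`):

import struct

def replaceheader_sketch(baselen, newlen):
    # one hunk covering [0, baselen) replaced by `newlen` new bytes
    return struct.pack(">lll", 0, baselen, newlen)

delta = replaceheader_sketch(5, 3) + b"new"
hlen = struct.calcsize(">lll")
assert delta[hlen:] == b"new"  # fulltext can be read off without patching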
524 524 @attr.s(slots=True, frozen=True)
525 525 class _deltainfo(object):
526 526 distance = attr.ib()
527 527 deltalen = attr.ib()
528 528 data = attr.ib()
529 529 base = attr.ib()
530 530 chainbase = attr.ib()
531 531 chainlen = attr.ib()
532 532 compresseddeltalen = attr.ib()
533 533 snapshotdepth = attr.ib()
534 534
535 535 def isgooddeltainfo(revlog, deltainfo, revinfo):
536 536 """Returns True if the given delta is good. Good means that it is within
537 537 the disk span, disk size, and chain length bounds that we know to be
538 538 performant."""
539 539 if deltainfo is None:
540 540 return False
541 541
542 542 # - 'deltainfo.distance' is the distance from the base revision --
543 543 # bounding it limits the amount of I/O we need to do.
544 544 # - 'deltainfo.compresseddeltalen' is the sum of the total size of
545 545 # deltas we need to apply -- bounding it limits the amount of CPU
546 546 # we consume.
547 547
548 548 textlen = revinfo.textlen
549 549 defaultmax = textlen * 4
550 550 maxdist = revlog._maxdeltachainspan
551 551 if not maxdist:
552 552 maxdist = deltainfo.distance # ensure the conditional pass
553 553 maxdist = max(maxdist, defaultmax)
554 554
555 555 # Bad delta from read span:
556 556 #
557 557 # If the span of data read is larger than the maximum allowed.
558 558 #
559 559 # In the sparse-revlog case, we rely on the associated "sparse reading"
560 560 # to avoid issues related to the span of data. In theory, it would be
561 561 # possible to build a pathological revlog where the delta pattern would
562 562 # lead to too many reads. However, these do not happen in practice at
563 563 # all, so we skip the span check entirely.
564 564 if not revlog._sparserevlog and maxdist < deltainfo.distance:
565 565 return False
566 566
567 567 # Bad delta from new delta size:
568 568 #
569 569 # If the delta size is larger than the target text, storing the
570 570 # delta will be inefficient.
571 571 if textlen < deltainfo.deltalen:
572 572 return False
573 573
574 574 # Bad delta from cumulative payload size:
575 575 #
576 576 # If the sum of the deltas gets larger than K * the target text length.
577 577 if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
578 578 return False
579 579
580 580 # Bad delta from chain length:
581 581 #
582 582 # If the number of deltas in the chain gets too high.
583 583 if (revlog._maxchainlen
584 584 and revlog._maxchainlen < deltainfo.chainlen):
585 585 return False
586 586
587 587 # bad delta from intermediate snapshot size limit
588 588 #
589 589 # If an intermediate snapshot size is higher than the limit. The
590 590 # limit exists to prevent endless chains of intermediate deltas from
591 591 # being created.
592 592 if (deltainfo.snapshotdepth is not None and
593 593 (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
594 594 return False
595 595
596 596 # bad delta if new intermediate snapshot is larger than the previous
597 597 # snapshot
598 598 if (deltainfo.snapshotdepth
599 599 and revlog.length(deltainfo.base) < deltainfo.deltalen):
600 600 return False
601 601
602 602 return True
603 603
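Among the bounds above, the intermediate-snapshot one is the least obvious: each extra level of snapshot depth halves the size a new snapshot may have, keeping nested snapshots geometrically bounded. A hedged mirror of just that check (`snapshot_size_ok` is a hypothetical helper, not part of the changeset):

def snapshot_size_ok(textlen, deltalen, snapshotdepth):
    # mirrors: (textlen >> snapshotdepth) < deltalen  =>  bad delta
    if snapshotdepth is None:
        return True  # not a snapshot, bound does not apply
    return deltalen <= (textlen >> snapshotdepth)

# For a 1000-byte fulltext: depth 1 allows up to 500 bytes, depth 2 up
# to 250, depth 3 up to 125.
assert snapshot_size_ok(1000, 400, 1)
assert not snapshot_size_ok(1000, 400, 2)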
604 604 # If a revision's full text is that much bigger than a base candidate's full
605 605 # text, it is very unlikely that it will produce a valid delta. We no longer
606 606 # consider these candidates.
607 LIMIT_BASE2TEXT = 50
607 LIMIT_BASE2TEXT = 500
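For a sense of scale, this constant feeds the sparse-revlog filter in `_candidategroups` below, which skips a base when `revlog.rawsize(rev) < textlen // LIMIT_BASE2TEXT`. Raising the ratio from 50 to 500 therefore keeps many more small bases in play (numbers are hypothetical, and `base_considered` is an illustrative helper only):

def base_considered(textlen, basesize, limit):
    # a base survives the filter when it is at least textlen/limit bytes
    return basesize >= textlen // limit

textlen, basesize = 100000, 1200   # 100 kB fulltext, 1.2 kB base
assert not base_considered(textlen, basesize, 50)   # skipped at ratio 50
assert base_considered(textlen, basesize, 500)      # kept at ratio 500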
608 608
609 609 def _candidategroups(revlog, textlen, p1, p2, cachedelta):
610 610 """Provides group of revision to be tested as delta base
611 611
612 612 This top level function focus on emitting groups with unique and worthwhile
613 613 content. See _raw_candidate_groups for details about the group order.
614 614 """
615 615 # should we try to build a delta?
616 616 if not (len(revlog) and revlog._storedeltachains):
617 617 yield None
618 618 return
619 619
620 620 deltalength = revlog.length
621 621 deltaparent = revlog.deltaparent
622 622 sparse = revlog._sparserevlog
623 623 good = None
624 624
625 625 deltas_limit = textlen * LIMIT_DELTA2TEXT
626 626
627 627 tested = set([nullrev])
628 628 candidates = _refinedgroups(revlog, p1, p2, cachedelta)
629 629 while True:
630 630 temptative = candidates.send(good)
631 631 if temptative is None:
632 632 break
633 633 group = []
634 634 for rev in temptative:
635 635 # skip over empty deltas (no need to include them in a chain)
636 636 while (revlog._generaldelta
637 637 and not (rev == nullrev
638 638 or rev in tested
639 639 or deltalength(rev))):
640 640 tested.add(rev)
641 641 rev = deltaparent(rev)
642 642 # no need to try a delta against nullrev; this will be done as a
643 643 # last resort.
644 644 if rev == nullrev:
645 645 continue
646 646 # filter out revisions we already tested
647 647 if rev in tested:
648 648 continue
649 649 tested.add(rev)
650 650 # filter out delta bases that will never produce a good delta
651 651 if deltas_limit < revlog.length(rev):
652 652 continue
653 653 if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
654 654 continue
655 655 # no delta for rawtext-changing revs (see "candelta" for why)
656 656 if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
657 657 continue
658 658 # If we reach here, we are about to build and test a delta.
659 659 # The delta building process will compute the chaininfo in all
660 660 # cases; since that computation is cached, it is fine to access it
661 661 # here too.
662 662 chainlen, chainsize = revlog._chaininfo(rev)
663 663 # if the chain would be too long, skip this base
664 664 if revlog._maxchainlen and chainlen >= revlog._maxchainlen:
665 665 continue
666 666 # if the chain already has too much data, skip this base
667 667 if deltas_limit < chainsize:
668 668 continue
669 669 group.append(rev)
670 670 if group:
671 671 # XXX: in the sparse revlog case, the group can become large,
672 672 # impacting performance. Some bounding or slicing mechanism
673 673 # would help to reduce this impact.
674 674 good = yield tuple(group)
675 675 yield None
676 676
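`_candidategroups` drives `_refinedgroups` through the generator `send()` protocol: every `yield` hands out a group of candidate bases, and the value sent back (the base that proved good, or None) steers the next group. A revlog-free sketch of that handshake (all names hypothetical):

def producer():
    good = yield (1, 2)        # first group of candidate bases
    if good is not None:
        yield (good + 10,)     # refine around the accepted base
    yield None                 # no more candidates

gen = producer()
group = next(gen)              # (1, 2)
while group is not None:
    best = max(group)          # stand-in for "build deltas, keep the best"
    group = gen.send(best)     # report the winner back to the producer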
677 677 def _findsnapshots(revlog, cache, start_rev):
678 678 """find snapshot from start_rev to tip"""
679 679 deltaparent = revlog.deltaparent
680 680 issnapshot = revlog.issnapshot
681 681 for rev in revlog.revs(start_rev):
682 682 if issnapshot(rev):
683 683 cache[deltaparent(rev)].append(rev)
684 684
685 685 def _refinedgroups(revlog, p1, p2, cachedelta):
686 686 good = None
687 687 # First we try to reuse the delta contained in the bundle.
688 688 # (or from the source revlog)
689 689 #
690 690 # This logic only applies to general delta repositories and can be disabled
691 691 # through configuration. Disabling source delta reuse is useful when
692 692 # we want to make sure we recompute "optimal" deltas.
693 693 if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
694 694 # Assume what we received from the server is a good choice;
695 695 # the delta build will reuse the cache
696 696 good = yield (cachedelta[0],)
697 697 if good is not None:
698 698 yield None
699 699 return
700 700 for candidates in _rawgroups(revlog, p1, p2, cachedelta):
701 701 good = yield candidates
702 702 if good is not None:
703 703 break
704 704
705 705 # If sparse revlog is enabled, we can try to refine the available deltas
706 706 if not revlog._sparserevlog:
707 707 yield None
708 708 return
709 709
710 710 # if we have a refinable value, try to refine it
711 711 if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
712 712 # refine snapshot down
713 713 previous = None
714 714 while previous != good:
715 715 previous = good
716 716 base = revlog.deltaparent(good)
717 717 if base == nullrev:
718 718 break
719 719 good = yield (base,)
720 720 # refine snapshot up
721 721 #
722 722 # XXX the _findsnapshots call can be expensive and is "duplicated" with
723 723 # the one done in `_rawgroups`. Once we start working on performance,
724 724 # we should make the two code paths share this computation.
725 725 snapshots = collections.defaultdict(list)
726 726 _findsnapshots(revlog, snapshots, good + 1)
727 727 previous = None
728 728 while good != previous:
729 729 previous = good
730 730 children = tuple(sorted(c for c in snapshots[good]))
731 731 good = yield children
732 732
733 733 # we have found nothing
734 734 yield None
735 735
736 736 def _rawgroups(revlog, p1, p2, cachedelta):
737 737 """Provides group of revision to be tested as delta base
738 738
739 739 This lower level function focus on emitting delta theorically interresting
740 740 without looking it any practical details.
741 741
742 742 The group order aims at providing fast or small candidates first.
743 743 """
744 744 gdelta = revlog._generaldelta
745 745 sparse = revlog._sparserevlog
746 746 curr = len(revlog)
747 747 prev = curr - 1
748 748 deltachain = lambda rev: revlog._deltachain(rev)[0]
749 749
750 750 if gdelta:
751 751 # exclude already lazy tested base if any
752 752 parents = [p for p in (p1, p2) if p != nullrev]
753 753
754 754 if not revlog._deltabothparents and len(parents) == 2:
755 755 parents.sort()
756 756 # To minimize the chance of having to build a fulltext,
757 757 # pick first whichever parent is closest to us (max rev)
758 758 yield (parents[1],)
759 759 # then the other one (min rev) if the first did not fit
760 760 yield (parents[0],)
761 761 elif len(parents) > 0:
762 762 # Test all parents (1 or 2), and keep the best candidate
763 763 yield parents
764 764
765 765 if sparse and parents:
766 766 snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
767 767 # See if an existing snapshot in the parent chains can be used as
768 768 # a base for a new intermediate snapshot
769 769 #
770 770 # search for snapshots in the parents' delta chains
771 771 # map: snapshot-level: snapshot-rev
772 772 parents_snaps = collections.defaultdict(set)
773 773 candidate_chains = [deltachain(p) for p in parents]
774 774 for chain in candidate_chains:
775 775 for idx, s in enumerate(chain):
776 776 if not revlog.issnapshot(s):
777 777 break
778 778 parents_snaps[idx].add(s)
779 779 snapfloor = min(parents_snaps[0]) + 1
780 780 _findsnapshots(revlog, snapshots, snapfloor)
781 781 # search for the highest "unrelated" revision
782 782 #
783 783 # Adding snapshots used by an "unrelated" revision increases the odds
784 784 # that we reuse an independent, yet better, snapshot chain.
785 785 #
786 786 # XXX instead of building a set of revisions, we could lazily enumerate
787 787 # over the chains. That would be more efficient; however, we stick to
788 788 # simple code for now.
789 789 all_revs = set()
790 790 for chain in candidate_chains:
791 791 all_revs.update(chain)
792 792 other = None
793 793 for r in revlog.revs(prev, snapfloor):
794 794 if r not in all_revs:
795 795 other = r
796 796 break
797 797 if other is not None:
798 798 # To avoid unfair competition, we won't use unrelated intermediate
799 799 # snapshots that are deeper than the ones from the parent delta
800 800 # chain.
801 801 max_depth = max(parents_snaps.keys())
802 802 chain = deltachain(other)
803 803 for idx, s in enumerate(chain):
804 804 if s < snapfloor:
805 805 continue
806 806 if max_depth < idx:
807 807 break
808 808 if not revlog.issnapshot(s):
809 809 break
810 810 parents_snaps[idx].add(s)
811 811 # Test them as possible intermediate snapshot bases.
812 812 # We test them from highest to lowest level. High level ones are more
813 813 # likely to result in a small delta.
814 814 floor = None
815 815 for idx, snaps in sorted(parents_snaps.items(), reverse=True):
816 816 siblings = set()
817 817 for s in snaps:
818 818 siblings.update(snapshots[s])
819 819 # Before considering making a new intermediate snapshot, we check
820 820 # if an existing snapshot, a child of the base we consider, would
821 821 # be suitable.
822 822 #
823 823 # It gives a chance to reuse a delta chain "unrelated" to the
824 824 # current revision instead of starting our own. Without such
825 825 # reuse, topological branches would keep reopening new chains,
826 826 # creating more and more snapshots as the repository grows.
827 827
828 828 if floor is not None:
829 829 # We only do this for siblings created after the one in our
830 830 # parent's delta chain. Those created before have less chance
831 831 # of being a valid base, since our ancestors had to create a
832 832 # new snapshot.
833 833 siblings = [r for r in siblings if floor < r]
834 834 yield tuple(sorted(siblings))
835 835 # then test the base from our parent's delta chain.
836 836 yield tuple(sorted(snaps))
837 837 floor = min(snaps)
838 838 # No suitable base found in the parent chain; check whether any full
839 839 # snapshot emitted since the parent's base would be a suitable base
840 840 # for an intermediate snapshot.
841 841 #
842 842 # It gives a chance to reuse a delta chain unrelated to the current
843 843 # revision instead of starting our own. Without such reuse,
844 844 # topological branches would keep reopening new full chains, creating
845 845 # more and more snapshots as the repository grows.
846 846 yield tuple(snapshots[nullrev])
847 847
848 848 if not sparse:
849 849 # other approaches failed; try against prev to hopefully save us a
850 850 # fulltext.
851 851 yield (prev,)
852 852
853 853 class deltacomputer(object):
854 854 def __init__(self, revlog):
855 855 self.revlog = revlog
856 856
857 857 def buildtext(self, revinfo, fh):
858 858 """Builds a fulltext version of a revision
859 859
860 860 revinfo: _revisioninfo instance that contains all needed info
861 861 fh: file handle to either the .i or the .d revlog file,
862 862 depending on whether it is inlined or not
863 863 """
864 864 btext = revinfo.btext
865 865 if btext[0] is not None:
866 866 return btext[0]
867 867
868 868 revlog = self.revlog
869 869 cachedelta = revinfo.cachedelta
870 870 baserev = cachedelta[0]
871 871 delta = cachedelta[1]
872 872
873 873 fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
874 874 revinfo.p1, revinfo.p2,
875 875 revinfo.flags, revinfo.node)
876 876 return fulltext
877 877
878 878 def _builddeltadiff(self, base, revinfo, fh):
879 879 revlog = self.revlog
880 880 t = self.buildtext(revinfo, fh)
881 881 if revlog.iscensored(base):
882 882 # deltas based on a censored revision must replace the
883 883 # full content in one patch, so delta works everywhere
884 884 header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
885 885 delta = header + t
886 886 else:
887 887 ptext = revlog.revision(base, _df=fh, raw=True)
888 888 delta = mdiff.textdiff(ptext, t)
889 889
890 890 return delta
891 891
892 892 def _builddeltainfo(self, revinfo, base, fh):
893 893 # can we use the cached delta?
894 894 delta = None
895 895 if revinfo.cachedelta:
896 896 cachebase, cachediff = revinfo.cachedelta
897 897 # check if the diff still applies
898 898 currentbase = cachebase
899 899 while (currentbase != nullrev
900 900 and currentbase != base
901 901 and self.revlog.length(currentbase) == 0):
902 902 currentbase = self.revlog.deltaparent(currentbase)
903 903 if currentbase == base:
904 904 delta = revinfo.cachedelta[1]
905 905 if delta is None:
906 906 delta = self._builddeltadiff(base, revinfo, fh)
907 907 revlog = self.revlog
908 908 header, data = revlog.compress(delta)
909 909 deltalen = len(header) + len(data)
910 910 chainbase = revlog.chainbase(base)
911 911 offset = revlog.end(len(revlog) - 1)
912 912 dist = deltalen + offset - revlog.start(chainbase)
913 913 if revlog._generaldelta:
914 914 deltabase = base
915 915 else:
916 916 deltabase = chainbase
917 917 chainlen, compresseddeltalen = revlog._chaininfo(base)
918 918 chainlen += 1
919 919 compresseddeltalen += deltalen
920 920
921 921 revlog = self.revlog
922 922 snapshotdepth = None
923 923 if deltabase == nullrev:
924 924 snapshotdepth = 0
925 925 elif revlog._sparserevlog and revlog.issnapshot(deltabase):
926 926 # A delta chain should always be one full snapshot,
927 927 # zero or more semi-snapshots, and zero or more deltas
928 928 p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
929 929 if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
930 930 snapshotdepth = len(revlog._deltachain(deltabase)[0])
931 931
932 932 return _deltainfo(dist, deltalen, (header, data), deltabase,
933 933 chainbase, chainlen, compresseddeltalen,
934 934 snapshotdepth)
935 935
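The `snapshotdepth` computed above encodes where the new revision sits in a sparse-revlog chain: 0 for a full snapshot (base is nullrev), otherwise the length of the base's delta chain when that base is itself a snapshot. A comment-level sketch of the expected chain shape (hypothetical chain, not revlog data):

# A sparse-revlog delta chain is expected to look like:
#   [full snapshot] -> [intermediate snapshots ...] -> [plain deltas ...]
chain = ['full', 'snap-1', 'snap-2', 'delta', 'delta']
# A new snapshot whose base is 'snap-2' would get depth
# len(['full', 'snap-1', 'snap-2']) == 3, which the size bound in
# isgooddeltainfo then uses as textlen >> 3.
assert chain.index('delta') == 3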
936 936 def _fullsnapshotinfo(self, fh, revinfo):
937 937 curr = len(self.revlog)
938 938 rawtext = self.buildtext(revinfo, fh)
939 939 data = self.revlog.compress(rawtext)
940 940 compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
941 941 deltabase = chainbase = curr
942 942 snapshotdepth = 0
943 943 chainlen = 1
944 944
945 945 return _deltainfo(dist, deltalen, data, deltabase,
946 946 chainbase, chainlen, compresseddeltalen,
947 947 snapshotdepth)
948 948
949 949 def finddeltainfo(self, revinfo, fh):
950 950 """Find an acceptable delta against a candidate revision
951 951
952 952 revinfo: information about the revision (instance of _revisioninfo)
953 953 fh: file handle to either the .i or the .d revlog file,
954 954 depending on whether it is inlined or not
955 955
956 956 Returns the first acceptable candidate revision, as ordered by
957 957 _candidategroups
958 958
959 959 If no suitable deltabase is found, we return delta info for a full
960 960 snapshot.
961 961 """
962 962 if not revinfo.textlen:
963 963 return self._fullsnapshotinfo(fh, revinfo)
964 964
965 965 # no delta for flag-processor revisions (see "candelta" for why)
966 966 # not calling candelta since only one revision needs testing, also to
967 967 # avoid the overhead of fetching flags again.
968 968 if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
969 969 return self._fullsnapshotinfo(fh, revinfo)
970 970
971 971 cachedelta = revinfo.cachedelta
972 972 p1 = revinfo.p1
973 973 p2 = revinfo.p2
974 974 revlog = self.revlog
975 975
976 976 deltainfo = None
977 977 p1r, p2r = revlog.rev(p1), revlog.rev(p2)
978 978 groups = _candidategroups(self.revlog, revinfo.textlen,
979 979 p1r, p2r, cachedelta)
980 980 candidaterevs = next(groups)
981 981 while candidaterevs is not None:
982 982 nominateddeltas = []
983 983 if deltainfo is not None:
984 984 # if we already found a good delta,
985 985 # challenge it against refined candidates
986 986 nominateddeltas.append(deltainfo)
987 987 for candidaterev in candidaterevs:
988 988 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
989 989 if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
990 990 nominateddeltas.append(candidatedelta)
991 991 if nominateddeltas:
992 992 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
993 993 if deltainfo is not None:
994 994 candidaterevs = groups.send(deltainfo.base)
995 995 else:
996 996 candidaterevs = next(groups)
997 997
998 998 if deltainfo is None:
999 999 deltainfo = self._fullsnapshotinfo(fh, revinfo)
1000 1000 return deltainfo