##// END OF EJS Templates
sparse-revlog: put the native implementation of slicechunktodensity to use...
Boris Feld -
r40745:39d29542 default
parent child Browse files
Show More
@@ -1,977 +1,981 b''
1 1 # revlogdeltas.py - Logic around delta computation for revlog
2 2 #
3 3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
4 4 # Copyright 2018 Octobus <contact@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 """Helper class to compute deltas stored inside revlogs"""
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import collections
13 13 import struct
14 14
15 15 # import stuff from node for others to import from revlog
16 16 from ..node import (
17 17 nullrev,
18 18 )
19 19 from ..i18n import _
20 20
21 21 from .constants import (
22 22 REVIDX_ISCENSORED,
23 23 REVIDX_RAWTEXT_CHANGING_FLAGS,
24 24 )
25 25
26 26 from ..thirdparty import (
27 27 attr,
28 28 )
29 29
30 30 from .. import (
31 31 error,
32 32 mdiff,
33 33 )
34 34
# maximum <delta-chain-data>/<revision-text-length> ratio
# A delta chain whose cumulated (compressed) delta size exceeds this multiple
# of the revision's fulltext length is considered a bad storage choice.
LIMIT_DELTA2TEXT = 2
37 37
38 38 class _testrevlog(object):
39 39 """minimalist fake revlog to use in doctests"""
40 40
41 41 def __init__(self, data, density=0.5, mingap=0, snapshot=()):
42 42 """data is an list of revision payload boundaries"""
43 43 self._data = data
44 44 self._srdensitythreshold = density
45 45 self._srmingapsize = mingap
46 46 self._snapshot = set(snapshot)
47 self.index = None
47 48
48 49 def start(self, rev):
49 50 if rev == 0:
50 51 return 0
51 52 return self._data[rev - 1]
52 53
53 54 def end(self, rev):
54 55 return self._data[rev]
55 56
56 57 def length(self, rev):
57 58 return self.end(rev) - self.start(rev)
58 59
59 60 def __len__(self):
60 61 return len(self._data)
61 62
62 63 def issnapshot(self, rev):
63 64 return rev in self._snapshot
64 65
def slicechunk(revlog, revs, targetsize=None):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `revlog._srdensitythreshold`. No gap smaller than
    `revlog._srmingapsize` is skipped.

    If `targetsize` is set, no chunk larger than `targetsize` will be yield.
    For consistency with other slicing choice, this limit won't go lower than
    `revlog._srmingapsize`.

    If individual revisions chunk are larger than this limit, they will still
    be raised individually.

    >>> data = [
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ]
    >>> revlog = _testrevlog(data, snapshot=range(16))

    >>> list(slicechunk(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(slicechunk(revlog, [0, 15]))
    [[0], [15]]
    >>> list(slicechunk(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(slicechunk(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]

    Slicing with a maximum chunk size
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=15))
    [[0], [11], [13], [15]]
    >>> list(slicechunk(revlog, [0, 11, 13, 15], targetsize=20))
    [[0], [11], [13, 15]]
    """
    if targetsize is not None:
        targetsize = max(targetsize, revlog._srmingapsize)
    # targetsize should not be specified when evaluating delta candidates:
    # * targetsize is used to ensure we stay within specification when reading,
    # prefer the native (C) density slicing when the index provides one and
    # fall back to the pure python implementation otherwise
    densityslicing = getattr(revlog.index, 'slicechunktodensity', None)
    if densityslicing is None:
        def densityslicing(x, y, z):
            return _slicechunktodensity(revlog, x, y, z)
    density_chunks = densityslicing(revs,
                                    revlog._srdensitythreshold,
                                    revlog._srmingapsize)
    for chunk in density_chunks:
        # further split each density-selected chunk to honor `targetsize`
        for subchunk in _slicechunktosize(revlog, chunk, targetsize):
            yield subchunk
128 132
def _slicechunktosize(revlog, revs, targetsize=None):
    """slice revs to match the target size

    This is intended to be used on chunks that density slicing selected but
    that are still too large compared to the read guarantee of revlogs. This
    might happen when the "minimal gap size" interrupted the slicing or when
    chains are built in a way that creates large blocks next to each other.

    >>> data = [
    ...  3,  #0 (3)
    ...  5,  #1 (2)
    ...  6,  #2 (1)
    ...  8,  #3 (2)
    ...  8,  #4 (empty)
    ...  11, #5 (3)
    ...  12, #6 (1)
    ...  13, #7 (1)
    ...  14, #8 (1)
    ... ]

    == All snapshots cases ==
    >>> revlog = _testrevlog(data, snapshot=range(9))

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1, 2], [3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == No Snapshot cases ==
    >>> revlog = _testrevlog(data)

    Cases where chunk is already small enough
    >>> list(_slicechunktosize(revlog, [0], 3))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], 3))
    [[6, 7]]
    >>> list(_slicechunktosize(revlog, [0], None))
    [[0]]
    >>> list(_slicechunktosize(revlog, [6, 7], None))
    [[6, 7]]

    cases where we need actual slicing
    >>> list(_slicechunktosize(revlog, [0, 1], 3))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 3))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [1, 2, 3], 3))
    [[1], [2, 3]]
    >>> list(_slicechunktosize(revlog, [3, 5], 3))
    [[3], [5]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 3))
    [[3], [4, 5]]
    >>> list(_slicechunktosize(revlog, [5, 6, 7, 8], 3))
    [[5], [6, 7, 8]]
    >>> list(_slicechunktosize(revlog, [0, 1, 2, 3, 4, 5, 6, 7, 8], 3))
    [[0], [1, 2], [3], [5], [6, 7, 8]]

    Case with too large individual chunk (must return valid chunk)
    >>> list(_slicechunktosize(revlog, [0, 1], 2))
    [[0], [1]]
    >>> list(_slicechunktosize(revlog, [1, 3], 1))
    [[1], [3]]
    >>> list(_slicechunktosize(revlog, [3, 4, 5], 2))
    [[3], [5]]

    == mixed case ==
    >>> revlog = _testrevlog(data, snapshot=[0, 1, 2])
    >>> list(_slicechunktosize(revlog, list(range(9)), 5))
    [[0, 1], [2], [3, 4, 5], [6, 7, 8]]
    """
    assert targetsize is None or 0 <= targetsize
    startdata = revlog.start(revs[0])
    enddata = revlog.end(revs[-1])
    fullspan = enddata - startdata
    # fast path: the whole chunk already fits inside the target size
    if targetsize is None or fullspan <= targetsize:
        yield revs
        return

    startrevidx = 0
    endrevidx = 1
    iterrevs = enumerate(revs)
    next(iterrevs) # skip first rev.
    # first step: get snapshots out of the way
    for idx, r in iterrevs:
        span = revlog.end(r) - startdata
        snapshot = revlog.issnapshot(r)
        if span <= targetsize and snapshot:
            # snapshot still fits in the current chunk: extend the chunk
            endrevidx = idx + 1
        else:
            # flush the current chunk and start a new one at this revision
            chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
            if chunk:
                yield chunk
            startrevidx = idx
            startdata = revlog.start(r)
            endrevidx = idx + 1
        if not snapshot:
            # first non-snapshot revision: switch to binary slicing below
            break

    # for the others, we use binary slicing to quickly converge toward valid
    # chunks (otherwise, we might end up looking for start/end of many
    # revisions). This logic is not looking for the perfect slicing point, it
    # focuses on quickly converging toward valid chunks.
    nbitem = len(revs)
    while (enddata - startdata) > targetsize:
        endrevidx = nbitem
        if nbitem - startrevidx <= 1:
            break # protect against individual chunk larger than limit
        localenddata = revlog.end(revs[endrevidx - 1])
        span = localenddata - startdata
        while span > targetsize:
            if endrevidx - startrevidx <= 1:
                break # protect against individual chunk larger than limit
            # halve the candidate range until the span fits
            endrevidx -= (endrevidx - startrevidx) // 2
            localenddata = revlog.end(revs[endrevidx - 1])
            span = localenddata - startdata
        chunk = _trimchunk(revlog, revs, startrevidx, endrevidx)
        if chunk:
            yield chunk
        startrevidx = endrevidx
        startdata = revlog.start(revs[startrevidx])

    # emit whatever remains after the last slicing point
    chunk = _trimchunk(revlog, revs, startrevidx)
    if chunk:
        yield chunk
282 286
def _slicechunktodensity(revlog, revs, targetdensity=0.5,
                         mingapsize=0):
    """slice revs to reduce the amount of unrelated data to be read from disk.

    ``revs`` is sliced into groups that should be read in one time.
    Assume that revs are sorted.

    The initial chunk is sliced until the overall density (payload/chunks-span
    ratio) is above `targetdensity`. No gap smaller than `mingapsize` is
    skipped.

    >>> revlog = _testrevlog([
    ...  5,  #00 (5)
    ...  10, #01 (5)
    ...  12, #02 (2)
    ...  12, #03 (empty)
    ...  27, #04 (15)
    ...  31, #05 (4)
    ...  31, #06 (empty)
    ...  42, #07 (11)
    ...  47, #08 (5)
    ...  47, #09 (empty)
    ...  48, #10 (1)
    ...  51, #11 (3)
    ...  74, #12 (23)
    ...  85, #13 (11)
    ...  86, #14 (1)
    ...  91, #15 (5)
    ... ])

    >>> list(_slicechunktodensity(revlog, list(range(16))))
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    >>> list(_slicechunktodensity(revlog, [0, 15]))
    [[0], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 15]))
    [[0], [11], [15]]
    >>> list(_slicechunktodensity(revlog, [0, 11, 13, 15]))
    [[0], [11, 13, 15]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14]))
    [[1, 2], [5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           mingapsize=20))
    [[1, 2, 3, 5, 8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95))
    [[1, 2], [5], [8, 10, 11], [14]]
    >>> list(_slicechunktodensity(revlog, [1, 2, 3, 5, 8, 10, 11, 14],
    ...                           targetdensity=0.95, mingapsize=12))
    [[1, 2], [5, 8, 10, 11], [14]]
    """
    start = revlog.start
    length = revlog.length

    # a single revision can never be split further
    if len(revs) <= 1:
        yield revs
        return

    deltachainspan = segmentspan(revlog, revs)

    # the whole span is below the minimal gap size: not worth slicing
    if deltachainspan < mingapsize:
        yield revs
        return

    readdata = deltachainspan
    chainpayload = sum(length(r) for r in revs)

    if deltachainspan:
        density = chainpayload / float(deltachainspan)
    else:
        density = 1.0

    # already dense enough, keep the chunk whole
    if density >= targetdensity:
        yield revs
        return

    # Collect the gaps in a list so they can be sorted and then popped from
    # the largest to the smallest
    gaps = []
    prevend = None
    for i, rev in enumerate(revs):
        revstart = start(rev)
        revlen = length(rev)

        # Skip empty revisions to form larger holes
        if revlen == 0:
            continue

        if prevend is not None:
            gapsize = revstart - prevend
            # only consider holes that are large enough
            if gapsize > mingapsize:
                gaps.append((gapsize, i))

        prevend = revstart + revlen
    # sort the gaps to pop them from largest to small
    gaps.sort()

    # Collect the indices of the largest holes until the density is acceptable
    selected = []
    while gaps and density < targetdensity:
        gapsize, gapidx = gaps.pop()

        selected.append(gapidx)

        # `gaps` is sorted by increasing size, so the pop() above returned
        # the largest remaining gap; skipping it removes those bytes from
        # the data actually read, which raises the density
        readdata -= gapsize
        if readdata > 0:
            density = chainpayload / float(readdata)
        else:
            density = 1.0
    selected.sort()

    # Cut the revs at collected indices
    previdx = 0
    for idx in selected:

        chunk = _trimchunk(revlog, revs, previdx, idx)
        if chunk:
            yield chunk

        previdx = idx

    chunk = _trimchunk(revlog, revs, previdx)
    if chunk:
        yield chunk
408 412
409 413 def _trimchunk(revlog, revs, startidx, endidx=None):
410 414 """returns revs[startidx:endidx] without empty trailing revs
411 415
412 416 Doctest Setup
413 417 >>> revlog = _testrevlog([
414 418 ... 5, #0
415 419 ... 10, #1
416 420 ... 12, #2
417 421 ... 12, #3 (empty)
418 422 ... 17, #4
419 423 ... 21, #5
420 424 ... 21, #6 (empty)
421 425 ... ])
422 426
423 427 Contiguous cases:
424 428 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0)
425 429 [0, 1, 2, 3, 4, 5]
426 430 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 5)
427 431 [0, 1, 2, 3, 4]
428 432 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 0, 4)
429 433 [0, 1, 2]
430 434 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 2, 4)
431 435 [2]
432 436 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3)
433 437 [3, 4, 5]
434 438 >>> _trimchunk(revlog, [0, 1, 2, 3, 4, 5, 6], 3, 5)
435 439 [3, 4]
436 440
437 441 Discontiguous cases:
438 442 >>> _trimchunk(revlog, [1, 3, 5, 6], 0)
439 443 [1, 3, 5]
440 444 >>> _trimchunk(revlog, [1, 3, 5, 6], 0, 2)
441 445 [1]
442 446 >>> _trimchunk(revlog, [1, 3, 5, 6], 1, 3)
443 447 [3, 5]
444 448 >>> _trimchunk(revlog, [1, 3, 5, 6], 1)
445 449 [3, 5]
446 450 """
447 451 length = revlog.length
448 452
449 453 if endidx is None:
450 454 endidx = len(revs)
451 455
452 456 # If we have a non-emtpy delta candidate, there are nothing to trim
453 457 if revs[endidx - 1] < len(revlog):
454 458 # Trim empty revs at the end, except the very first revision of a chain
455 459 while (endidx > 1
456 460 and endidx > startidx
457 461 and length(revs[endidx - 1]) == 0):
458 462 endidx -= 1
459 463
460 464 return revs[startidx:endidx]
461 465
def segmentspan(revlog, revs):
    """Get the byte span of a segment of revisions

    revs is a sorted array of revision numbers

    >>> revlog = _testrevlog([
    ...  5,  #0
    ...  10, #1
    ...  12, #2
    ...  12, #3 (empty)
    ...  17, #4
    ... ])

    >>> segmentspan(revlog, [0, 1, 2, 3, 4])
    17
    >>> segmentspan(revlog, [0, 4])
    17
    >>> segmentspan(revlog, [3, 4])
    5
    >>> segmentspan(revlog, [1, 2, 3,])
    7
    >>> segmentspan(revlog, [1, 3])
    7
    """
    if not revs:
        return 0
    # span covers everything between the start of the first revision and
    # the end of the last one, including any holes in between
    first, last = revs[0], revs[-1]
    return revlog.end(last) - revlog.start(first)
490 494
def _textfromdelta(fh, revlog, baserev, delta, p1, p2, flags, expectednode):
    """build full text from a (base, delta) pair and other metadata

    Applies ``delta`` to the fulltext of ``baserev``, runs the result through
    the flag processors and validates the resulting node hash against
    ``expectednode``.  Censored revisions are tolerated only when the
    REVIDX_ISCENSORED flag is set.
    """
    # special case deltas which replace entire base; no need to decode
    # base revision. this neatly avoids censored bases, which throw when
    # they're decoded.
    hlen = struct.calcsize(">lll")
    if delta[:hlen] == mdiff.replacediffheader(revlog.rawsize(baserev),
                                               len(delta) - hlen):
        fulltext = delta[hlen:]
    else:
        # deltabase is rawtext before changed by flag processors, which is
        # equivalent to non-raw text
        basetext = revlog.revision(baserev, _df=fh, raw=False)
        fulltext = mdiff.patch(basetext, delta)

    try:
        res = revlog._processflags(fulltext, flags, 'read', raw=True)
        fulltext, validatehash = res
        if validatehash:
            revlog.checkhash(fulltext, expectednode, p1=p1, p2=p2)
        # a revision flagged censored must actually raise CensoredNodeError
        # when its hash is checked; reaching this point means it did not
        if flags & REVIDX_ISCENSORED:
            raise error.StorageError(_('node %s is not censored') %
                                     expectednode)
    except error.CensoredNodeError:
        # must pass the censored index flag to add censored revisions
        if not flags & REVIDX_ISCENSORED:
            raise
    return fulltext
519 523
@attr.s(slots=True, frozen=True)
class _deltainfo(object):
    """immutable description of a delta candidate for one revision"""
    # distance from the base revision -- bounds the I/O needed to restore
    distance = attr.ib()
    # size of the compressed delta itself
    deltalen = attr.ib()
    # the (header, compressed data) pair ready to be stored
    data = attr.ib()
    # revision the delta applies against
    base = attr.ib()
    # first revision of the delta chain
    chainbase = attr.ib()
    # number of deltas in the chain
    chainlen = attr.ib()
    # cumulated compressed size of all deltas in the chain
    compresseddeltalen = attr.ib()
    # depth when the result is an intermediate snapshot, None otherwise
    snapshotdepth = attr.ib()
530 534
def isgooddeltainfo(revlog, deltainfo, revinfo):
    """Returns True if the given delta is good. Good means that it is within
    the disk span, disk size, and chain length bounds that we know to be
    performant."""
    if deltainfo is None:
        return False

    # Bounding 'deltainfo.distance' limits the amount of I/O we need to do,
    # while bounding 'deltainfo.compresseddeltalen' limits the amount of CPU
    # spent applying deltas.

    textlen = revinfo.textlen

    # largest read span we are willing to accept
    span_limit = revlog._maxdeltachainspan
    if not span_limit:
        span_limit = deltainfo.distance # ensure the conditional pass
    span_limit = max(span_limit, textlen * 4)

    # Bad delta from read span:
    #
    # If the span of data read is larger than the maximum allowed.
    #
    # In the sparse-revlog case, we rely on the associated "sparse reading"
    # to avoid issue related to the span of data. In theory, it would be
    # possible to build pathological revlog where delta pattern would lead
    # to too many reads. However, they do not happen in practice at all. So
    # we skip the span check entirely.
    if not revlog._sparserevlog and deltainfo.distance > span_limit:
        return False

    # Bad delta from new delta size:
    #
    # A delta larger than the target text is an inefficient way to store it.
    if deltainfo.deltalen > textlen:
        return False

    # Bad delta from cumulated payload size:
    #
    # The whole chain must stay below K * target text length.
    if deltainfo.compresseddeltalen > textlen * LIMIT_DELTA2TEXT:
        return False

    # Bad delta from chain length:
    #
    # Reject chains with too many deltas.
    maxchain = revlog._maxchainlen
    if maxchain and deltainfo.chainlen > maxchain:
        return False

    # bad delta from intermediate snapshot size limit
    #
    # An intermediate snapshot may not exceed textlen >> depth; the limit
    # exists to prevent an endless chain of intermediate deltas from being
    # created.
    depth = deltainfo.snapshotdepth
    if depth is not None and deltainfo.deltalen > (textlen >> depth):
        return False

    # bad delta if new intermediate snapshot is larger than the previous
    # snapshot
    if depth and deltainfo.deltalen > revlog.length(deltainfo.base):
        return False

    return True
599 603
def _candidategroups(revlog, textlen, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This top level function focus on emitting groups with unique and worthwhile
    content. See _raw_candidate_groups for details about the group order.

    This is a generator driven through ``send()``: after each yielded group,
    the caller sends back the base that produced a good delta (or None) and
    receives the next group to try.  A final None terminates the exchange.
    """
    # should we try to build a delta?
    if not (len(revlog) and revlog._storedeltachains):
        yield None
        return

    deltalength = revlog.length
    deltaparent = revlog.deltaparent
    good = None

    # chains whose cumulated size already exceeds this can never yield a
    # good delta (see LIMIT_DELTA2TEXT)
    deltas_limit = textlen * LIMIT_DELTA2TEXT

    # nullrev is pre-seeded: the full-snapshot fallback handles it
    tested = set([nullrev])
    candidates = _refinedgroups(revlog, p1, p2, cachedelta)
    while True:
        temptative = candidates.send(good)
        if temptative is None:
            break
        group = []
        for rev in temptative:
            # skip over empty delta (no need to include them in a chain)
            while (revlog._generaldelta
                   and not (rev == nullrev
                            or rev in tested
                            or deltalength(rev))):
                tested.add(rev)
                rev = deltaparent(rev)
            # filter out revision we tested already
            if rev in tested:
                continue
            tested.add(rev)
            # filter out delta base that will never produce good delta
            if deltas_limit < revlog.length(rev):
                continue
            # no need to try a delta against nullrev, this will be done as a
            # last resort.
            if rev == nullrev:
                continue
            # no delta for rawtext-changing revs (see "candelta" for why)
            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                continue
            group.append(rev)
        if group:
            # XXX: in the sparse revlog case, group can become large,
            # impacting performances. Some bounding or slicing mechanism
            # would help to reduce this impact.
            good = yield tuple(group)
    yield None
653 657
654 658 def _findsnapshots(revlog, cache, start_rev):
655 659 """find snapshot from start_rev to tip"""
656 660 deltaparent = revlog.deltaparent
657 661 issnapshot = revlog.issnapshot
658 662 for rev in revlog.revs(start_rev):
659 663 if issnapshot(rev):
660 664 cache[deltaparent(rev)].append(rev)
661 665
def _refinedgroups(revlog, p1, p2, cachedelta):
    """Generate candidate groups, refining around any base reported as good

    Driven through ``send()``: the value sent back after each yielded group
    is the base that produced a good delta, or None.
    """
    good = None
    # First we try to reuse the delta contained in the bundle.
    # (or from the source revlog)
    #
    # This logic only applies to general delta repositories and can be disabled
    # through configuration. Disabling reuse source delta is useful when
    # we want to make sure we recomputed "optimal" deltas.
    if cachedelta and revlog._generaldelta and revlog._lazydeltabase:
        # Assume what we received from the server is a good choice
        # build delta will reuse the cache
        good = yield (cachedelta[0],)
        if good is not None:
            yield None
            return
    for candidates in _rawgroups(revlog, p1, p2, cachedelta):
        good = yield candidates
        if good is not None:
            break

    # If sparse revlog is enabled, we can try to refine the available deltas
    if not revlog._sparserevlog:
        yield None
        return

    # if we have a refinable value, try to refine it
    if good is not None and good not in (p1, p2) and revlog.issnapshot(good):
        # refine snapshot down: walk toward the root of the snapshot chain
        previous = None
        while previous != good:
            previous = good
            base = revlog.deltaparent(good)
            if base == nullrev:
                break
            good = yield (base,)
        # refine snapshot up: try the snapshots based on the current best
        #
        # XXX the _findsnapshots call can be expensive and is "duplicated" with
        # the one done in `_rawgroups`. Once we start working on performance,
        # we should make the two logics share this computation.
        snapshots = collections.defaultdict(list)
        _findsnapshots(revlog, snapshots, good + 1)
        previous = None
        while good != previous:
            previous = good
            children = tuple(sorted(c for c in snapshots[good]))
            good = yield children

    # we have found nothing
    yield None
712 716
def _rawgroups(revlog, p1, p2, cachedelta):
    """Provides group of revision to be tested as delta base

    This lower level function focuses on emitting deltas that are
    theoretically interesting, without looking at any practical details.

    The group order aims at providing fast or small candidates first.
    """
    gdelta = revlog._generaldelta
    sparse = revlog._sparserevlog
    curr = len(revlog)
    prev = curr - 1
    # helper returning the list of revisions in a delta chain
    deltachain = lambda rev: revlog._deltachain(rev)[0]

    if gdelta:
        # exclude already lazy tested base if any
        parents = [p for p in (p1, p2) if p != nullrev]

        if not revlog._deltabothparents and len(parents) == 2:
            parents.sort()
            # To minimize the chance of having to build a fulltext,
            # pick first whichever parent is closest to us (max rev)
            yield (parents[1],)
            # then the other one (min rev) if the first did not fit
            yield (parents[0],)
        elif len(parents) > 0:
            # Test all parents (1 or 2), and keep the best candidate
            yield parents

    if sparse and parents:
        snapshots = collections.defaultdict(list) # map: base-rev: snapshot-rev
        # See if we can use an existing snapshot in the parent chains to use as
        # a base for a new intermediate-snapshot
        #
        # search for snapshot in parents delta chain
        # map: snapshot-level: snapshot-rev
        parents_snaps = collections.defaultdict(set)
        candidate_chains = [deltachain(p) for p in parents]
        for chain in candidate_chains:
            for idx, s in enumerate(chain):
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        snapfloor = min(parents_snaps[0]) + 1
        _findsnapshots(revlog, snapshots, snapfloor)
        # search for the highest "unrelated" revision
        #
        # Adding snapshots used by "unrelated" revision increase the odd we
        # reuse an independent, yet better snapshot chain.
        #
        # XXX instead of building a set of revisions, we could lazily enumerate
        # over the chains. That would be more efficient, however we stick to
        # simple code for now.
        all_revs = set()
        for chain in candidate_chains:
            all_revs.update(chain)
        other = None
        for r in revlog.revs(prev, snapfloor):
            if r not in all_revs:
                other = r
                break
        if other is not None:
            # To avoid unfair competition, we won't use unrelated intermediate
            # snapshot that are deeper than the ones from the parent delta
            # chain.
            max_depth = max(parents_snaps.keys())
            chain = deltachain(other)
            for idx, s in enumerate(chain):
                if s < snapfloor:
                    continue
                if max_depth < idx:
                    break
                if not revlog.issnapshot(s):
                    break
                parents_snaps[idx].add(s)
        # Test them as possible intermediate snapshot base
        # We test them from highest to lowest level. High level one are more
        # likely to result in small delta
        floor = None
        for idx, snaps in sorted(parents_snaps.items(), reverse=True):
            siblings = set()
            for s in snaps:
                siblings.update(snapshots[s])
            # Before considering making a new intermediate snapshot, we check
            # if an existing snapshot, children of base we consider, would be
            # suitable.
            #
            # It gives a chance to reuse a delta chain "unrelated" to the
            # current revision instead of starting our own. Without such
            # re-use, topological branches would keep reopening new chains.
            # Creating more and more snapshot as the repository grow.

            if floor is not None:
                # We only do this for siblings created after the one in our
                # parent's delta chain. Those created before has less chances
                # to be valid base since our ancestors had to create a new
                # snapshot.
                siblings = [r for r in siblings if floor < r]
            yield tuple(sorted(siblings))
            # then test the base from our parent's delta chain.
            yield tuple(sorted(snaps))
            floor = min(snaps)
        # No suitable base found in the parent chain, search if any full
        # snapshots emitted since parent's base would be a suitable base for an
        # intermediate snapshot.
        #
        # It gives a chance to reuse a delta chain unrelated to the current
        # revisions instead of starting our own. Without such re-use,
        # topological branches would keep reopening new full chains. Creating
        # more and more snapshot as the repository grow.
        yield tuple(snapshots[nullrev])

    if not sparse:
        # other approach failed try against prev to hopefully save us a
        # fulltext.
        yield (prev,)
829 833
class deltacomputer(object):
    """Helper object computing and selecting delta information for revisions
    being added to a revlog."""

    def __init__(self, revlog):
        # the revlog this object computes deltas for
        self.revlog = revlog

    def buildtext(self, revinfo, fh):
        """Builds a fulltext version of a revision

        revinfo: _revisioninfo instance that contains all needed info
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not
        """
        btext = revinfo.btext
        # `btext` is a mutable one-element cache; reuse a previous result
        if btext[0] is not None:
            return btext[0]

        revlog = self.revlog
        cachedelta = revinfo.cachedelta
        baserev = cachedelta[0]
        delta = cachedelta[1]

        # store the computed fulltext in the cache cell for later calls
        fulltext = btext[0] = _textfromdelta(fh, revlog, baserev, delta,
                                             revinfo.p1, revinfo.p2,
                                             revinfo.flags, revinfo.node)
        return fulltext

    def _builddeltadiff(self, base, revinfo, fh):
        # Return an (uncompressed) delta of revinfo's text against `base`
        revlog = self.revlog
        t = self.buildtext(revinfo, fh)
        if revlog.iscensored(base):
            # deltas based on a censored revision must replace the
            # full content in one patch, so delta works everywhere
            header = mdiff.replacediffheader(revlog.rawsize(base), len(t))
            delta = header + t
        else:
            ptext = revlog.revision(base, _df=fh, raw=True)
            delta = mdiff.textdiff(ptext, t)

        return delta

    def _builddeltainfo(self, revinfo, base, fh):
        # Build a _deltainfo describing a delta of `revinfo` against `base`
        # can we use the cached delta?
        delta = None
        if revinfo.cachedelta:
            cachebase, cachediff = revinfo.cachedelta
            #check if the diff still apply
            currentbase = cachebase
            while (currentbase != nullrev
                   and currentbase != base
                   and self.revlog.length(currentbase) == 0):
                # walk past empty revisions: the cached diff applies
                # unchanged across them
                currentbase = self.revlog.deltaparent(currentbase)
            if currentbase == base:
                delta = revinfo.cachedelta[1]
        if delta is None:
            delta = self._builddeltadiff(base, revinfo, fh)
        revlog = self.revlog
        header, data = revlog.compress(delta)
        deltalen = len(header) + len(data)
        chainbase = revlog.chainbase(base)
        # distance from the start of the delta chain to the end of this delta
        offset = revlog.end(len(revlog) - 1)
        dist = deltalen + offset - revlog.start(chainbase)
        if revlog._generaldelta:
            deltabase = base
        else:
            deltabase = chainbase
        chainlen, compresseddeltalen = revlog._chaininfo(base)
        chainlen += 1
        compresseddeltalen += deltalen

        revlog = self.revlog
        snapshotdepth = None
        if deltabase == nullrev:
            # a delta against nullrev is a full snapshot (depth 0)
            snapshotdepth = 0
        elif revlog._sparserevlog and revlog.issnapshot(deltabase):
            # A delta chain should always be one full snapshot,
            # zero or more semi-snapshots, and zero or more deltas
            p1, p2 = revlog.rev(revinfo.p1), revlog.rev(revinfo.p2)
            if deltabase not in (p1, p2) and revlog.issnapshot(deltabase):
                snapshotdepth = len(revlog._deltachain(deltabase)[0])

        return _deltainfo(dist, deltalen, (header, data), deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def _fullsnapshotinfo(self, fh, revinfo):
        # Build a _deltainfo storing the revision as a full snapshot
        curr = len(self.revlog)
        rawtext = self.buildtext(revinfo, fh)
        data = self.revlog.compress(rawtext)
        compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
        deltabase = chainbase = curr
        snapshotdepth = 0
        chainlen = 1

        return _deltainfo(dist, deltalen, data, deltabase,
                          chainbase, chainlen, compresseddeltalen,
                          snapshotdepth)

    def finddeltainfo(self, revinfo, fh):
        """Find an acceptable delta against a candidate revision

        revinfo: information about the revision (instance of _revisioninfo)
        fh:      file handle to either the .i or the .d revlog file,
                 depending on whether it is inlined or not

        Returns the first acceptable candidate revision, as ordered by
        _candidategroups

        If no suitable deltabase is found, we return delta info for a full
        snapshot.
        """
        if not revinfo.textlen:
            # empty revision text: nothing to delta against
            return self._fullsnapshotinfo(fh, revinfo)

        # no delta for flag processor revision (see "candelta" for why)
        # not calling candelta since only one revision needs test, also to
        # avoid overhead fetching flags again.
        if revinfo.flags & REVIDX_RAWTEXT_CHANGING_FLAGS:
            return self._fullsnapshotinfo(fh, revinfo)

        cachedelta = revinfo.cachedelta
        p1 = revinfo.p1
        p2 = revinfo.p2
        revlog = self.revlog

        deltainfo = None
        p1r, p2r = revlog.rev(p1), revlog.rev(p2)
        groups = _candidategroups(self.revlog, revinfo.textlen,
                                  p1r, p2r, cachedelta)
        candidaterevs = next(groups)
        while candidaterevs is not None:
            nominateddeltas = []
            if deltainfo is not None:
                # if we already found a good delta,
                # challenge it against refined candidates
                nominateddeltas.append(deltainfo)
            for candidaterev in candidaterevs:
                candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                    nominateddeltas.append(candidatedelta)
            if nominateddeltas:
                # keep the smallest acceptable delta seen so far
                deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
            if deltainfo is not None:
                # feed the chosen base back so the generator can refine
                candidaterevs = groups.send(deltainfo.base)
            else:
                candidaterevs = next(groups)

        if deltainfo is None:
            deltainfo = self._fullsnapshotinfo(fh, revinfo)
        return deltainfo
General Comments 0
You need to be logged in to leave comments. Login now