copies: cache the ancestor checking call when tracing copy...
marmoute - r46504:06b64fab default
@@ -1,1112 +1,1130 @@
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import os
12 12
13 13 from .i18n import _
14 14
15 15
16 16 from . import (
17 17 match as matchmod,
18 18 node,
19 19 pathutil,
20 20 pycompat,
21 21 util,
22 22 )
23 23
24 24
25 25 from .utils import stringutil
26 26
27 27 from .revlogutils import flagutil
28 28
29 29
30 30 def _filter(src, dst, t):
31 31 """filters out invalid copies after chaining"""
32 32
33 33 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
34 34 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
35 35 # in the following table (not including trivial cases). For example, case 2
36 36 # is where a file existed in 'src' and remained under that name in 'mid' and
37 37 # then was renamed between 'mid' and 'dst'.
38 38 #
39 39 # case src mid dst result
40 40 # 1 x y - -
41 41 # 2 x y y x->y
42 42 # 3 x y x -
43 43 # 4 x y z x->z
44 44 # 5 - x y -
45 45 # 6 x x y x->y
46 46 #
47 47 # _chain() takes care of chaining the copies in 'a' and 'b', but it
48 48 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
49 49 # between 5 and 6, so it includes all cases in its result.
50 50 # Cases 1, 3, and 5 are then removed by _filter().
51 51
52 52 for k, v in list(t.items()):
53 53 # remove copies from files that didn't exist
54 54 if v not in src:
55 55 del t[k]
56 56 # remove criss-crossed copies
57 57 elif k in src and v in dst:
58 58 del t[k]
59 59 # remove copies to files that were then removed
60 60 elif k not in dst:
61 61 del t[k]
62 62
63 63
64 64 def _chain(prefix, suffix):
65 65 """chain two sets of copies 'prefix' and 'suffix'"""
66 66 result = prefix.copy()
67 67 for key, value in pycompat.iteritems(suffix):
68 68 result[key] = prefix.get(value, value)
69 69 return result
70 70
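# A minimal sketch of how _chain() composes two copy mappings; the dicts
# below are hypothetical examples, not data read from a repository:
#
#     >>> prefix = {b'b': b'a'}   # src -> mid: 'a' was copied to 'b'
#     >>> suffix = {b'c': b'b'}   # mid -> dst: 'b' was copied to 'c'
#     >>> _chain(prefix, suffix)
#     {b'b': b'a', b'c': b'a'}
#
# _filter() is then expected to prune entries whose source is missing from
# 'src' or whose destination is missing from 'dst' (cases 1, 3 and 5 in the
# table above).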
71 71
72 72 def _tracefile(fctx, am, basemf):
73 73 """return file context that is the ancestor of fctx present in ancestor
74 74 manifest am
75 75
76 76 Note: we used to try and stop after a given limit, however checking if that
77 77 limit is reached turned out to be very expensive. we are better off
78 78 disabling that feature."""
79 79
80 80 for f in fctx.ancestors():
81 81 path = f.path()
82 82 if am.get(path, None) == f.filenode():
83 83 return path
84 84 if basemf and basemf.get(path, None) == f.filenode():
85 85 return path
86 86
87 87
88 88 def _dirstatecopies(repo, match=None):
89 89 ds = repo.dirstate
90 90 c = ds.copies().copy()
91 91 for k in list(c):
92 92 if ds[k] not in b'anm' or (match and not match(k)):
93 93 del c[k]
94 94 return c
95 95
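# Sketch of the filtering above: b'anm' covers the dirstate states
# 'a' (added), 'n' (normal) and 'm' (merged), so copy records whose
# destination is in any other state (e.g. 'r', removed) are dropped,
# as are records rejected by the optional matcher.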
96 96
97 97 def _computeforwardmissing(a, b, match=None):
98 98 """Computes which files are in b but not a.
99 99 This is its own function so extensions can easily wrap this call to see what
100 100 files _forwardcopies is about to process.
101 101 """
102 102 ma = a.manifest()
103 103 mb = b.manifest()
104 104 return mb.filesnotin(ma, match=match)
105 105
106 106
107 107 def usechangesetcentricalgo(repo):
108 108 """Checks if we should use changeset-centric copy algorithms"""
109 109 if repo.filecopiesmode == b'changeset-sidedata':
110 110 return True
111 111 readfrom = repo.ui.config(b'experimental', b'copies.read-from')
112 112 changesetsource = (b'changeset-only', b'compatibility')
113 113 return readfrom in changesetsource
114 114
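# A hypothetical hgrc snippet selecting a changeset-centric source for
# copy information (either value in `changesetsource` above works):
#
#     [experimental]
#     copies.read-from = changeset-only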
115 115
116 116 def _committedforwardcopies(a, b, base, match):
117 117 """Like _forwardcopies(), but b.rev() cannot be None (working copy)"""
118 118 # files might have to be traced back to the fctx parent of the last
119 119 # one-side-only changeset, but not further back than that
120 120 repo = a._repo
121 121
122 122 if usechangesetcentricalgo(repo):
123 123 return _changesetforwardcopies(a, b, match)
124 124
125 125 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
126 126 dbg = repo.ui.debug
127 127 if debug:
128 128 dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
129 129 am = a.manifest()
130 130 basemf = None if base is None else base.manifest()
131 131
132 132 # find where new files came from
133 133 # we currently don't try to find where old files went, too expensive
134 134 # this means we can miss a case like 'hg rm b; hg cp a b'
135 135 cm = {}
136 136
137 137 # Computing the forward missing is quite expensive on large manifests, since
138 138 # it compares the entire manifests. We can optimize it in the common use
139 139 # case of computing what copies are in a commit versus its parent (like
140 140 # during a rebase or histedit). Note, we exclude merge commits from this
141 141 # optimization, since the ctx.files() for a merge commit is not correct for
142 142 # this comparison.
143 143 forwardmissingmatch = match
144 144 if b.p1() == a and b.p2().node() == node.nullid:
145 145 filesmatcher = matchmod.exact(b.files())
146 146 forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
147 147 missing = _computeforwardmissing(a, b, match=forwardmissingmatch)
148 148
149 149 ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
150 150
151 151 if debug:
152 152 dbg(b'debug.copies: missing files to search: %d\n' % len(missing))
153 153
154 154 for f in sorted(missing):
155 155 if debug:
156 156 dbg(b'debug.copies: tracing file: %s\n' % f)
157 157 fctx = b[f]
158 158 fctx._ancestrycontext = ancestrycontext
159 159
160 160 if debug:
161 161 start = util.timer()
162 162 opath = _tracefile(fctx, am, basemf)
163 163 if opath:
164 164 if debug:
165 165 dbg(b'debug.copies: rename of: %s\n' % opath)
166 166 cm[f] = opath
167 167 if debug:
168 168 dbg(
169 169 b'debug.copies: time: %f seconds\n'
170 170 % (util.timer() - start)
171 171 )
172 172 return cm
173 173
174 174
175 175 def _revinfo_getter(repo):
176 176 """returns a function that returns the following data given a <rev>"
177 177
178 178 * p1: revision number of first parent
179 179 * p2: revision number of second parent
180 180 * changes: a ChangingFiles object
181 181 """
182 182 cl = repo.changelog
183 183 parents = cl.parentrevs
184 184 flags = cl.flags
185 185
186 186 HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO
187 187
188 188 changelogrevision = cl.changelogrevision
189 189
190 190 # A small cache to avoid doing the work twice for merges
191 191 #
192 192 # In the vast majority of cases, if we ask for information about a
193 193 # revision for one parent, we'll later ask for it for the other. So it
194 194 # makes sense to keep the information around when reaching the first
195 195 # parent of a merge and to drop it after it has been provided for the
196 196 # second parent.
197 197 #
198 198 # There are cases where only one parent of a merge will be walked. This
199 199 # happens when the "destination" of the copy tracing is a descendant of
200 200 # a new root not common with the "source". In that case, we will only
201 201 # walk through merge parents that are descendants of changesets common
202 202 # between "source" and "destination".
203 203 #
204 204 # With the current implementation, if such changesets carry copy
205 205 # information, we keep them in memory until the end of
206 206 # _changesetforwardcopies. We don't expect this case to be frequent
207 207 # enough to matter.
208 208 #
209 209 # In addition, it is possible to reach a pathological case where many
210 210 # first parents are met before any second parent is reached. In that
211 211 # case the cache could grow. If this ever becomes an issue, one can
212 212 # safely introduce a maximum cache size, trading extra CPU/IO time for memory.
213 213 merge_caches = {}
214 214
215 215 def revinfo(rev):
216 216 p1, p2 = parents(rev)
217 217 value = None
218 218 e = merge_caches.pop(rev, None)
219 219 if e is not None:
220 220 return e
221 221 changes = None
222 222 if flags(rev) & HASCOPIESINFO:
223 223 changes = changelogrevision(rev).changes
224 224 value = (p1, p2, changes)
225 225 if p1 != node.nullrev and p2 != node.nullrev:
226 226 # XXX in some cases we over-cache, IGNORE
227 227 merge_caches[rev] = value
228 228 return value
229 229
230 230 return revinfo
231 231
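# Sketch of the merge cache behaviour (revision numbers are hypothetical):
#
#     revinfo = _revinfo_getter(repo)
#     revinfo(5)   # 5 is a merge: (p1, p2, changes) is computed and a copy
#                  # is kept in merge_caches
#     revinfo(5)   # asked again while handling the other parent: the cached
#                  # entry is popped and returned without re-reading sidedata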
232 232
233 def cached_is_ancestor(is_ancestor):
234 """return a cached version of is_ancestor"""
235 cache = {}
236
237 def _is_ancestor(anc, desc):
238 if anc > desc:
239 return False
240 elif anc == desc:
241 return True
242 key = (anc, desc)
243 ret = cache.get(key)
244 if ret is None:
245 ret = cache[key] = is_ancestor(anc, desc)
246 return ret
247
248 return _is_ancestor
249
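# A minimal sketch of the memoization introduced here (revision numbers
# and the stub below are hypothetical):
#
#     >>> calls = []
#     >>> def slow_is_ancestor(anc, desc):
#     ...     calls.append((anc, desc))
#     ...     return True
#     >>> fast = cached_is_ancestor(slow_is_ancestor)
#     >>> fast(2, 7), fast(2, 7), fast(7, 2), fast(3, 3)
#     (True, True, False, True)
#     >>> calls   # the expensive check ran only once
#     [(2, 7)]
#
# fast(7, 2) is answered by the `anc > desc` fast path (a revision is never
# an ancestor of a lower-numbered revision) and fast(3, 3) by the equality
# fast path, so neither touches the cache or the wrapped function.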
250
233 251 def _changesetforwardcopies(a, b, match):
234 252 if a.rev() in (node.nullrev, b.rev()):
235 253 return {}
236 254
237 255 repo = a.repo().unfiltered()
238 256 children = {}
239 257
240 258 cl = repo.changelog
241 isancestor = cl.isancestorrev # XXX we should have added caching to this.
259 isancestor = cached_is_ancestor(cl.isancestorrev)
242 260 missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
243 261 mrset = set(missingrevs)
244 262 roots = set()
245 263 for r in missingrevs:
246 264 for p in cl.parentrevs(r):
247 265 if p == node.nullrev:
248 266 continue
249 267 if p not in children:
250 268 children[p] = [r]
251 269 else:
252 270 children[p].append(r)
253 271 if p not in mrset:
254 272 roots.add(p)
255 273 if not roots:
256 274 # no common revision to track copies from
257 275 return {}
258 276 min_root = min(roots)
259 277
260 278 from_head = set(
261 279 cl.reachableroots(min_root, [b.rev()], list(roots), includepath=True)
262 280 )
263 281
264 282 iterrevs = set(from_head)
265 283 iterrevs &= mrset
266 284 iterrevs.update(roots)
267 285 iterrevs.remove(b.rev())
268 286 revs = sorted(iterrevs)
269 287
270 288 if repo.filecopiesmode == b'changeset-sidedata':
271 289 revinfo = _revinfo_getter(repo)
272 290 return _combine_changeset_copies(
273 291 revs, children, b.rev(), revinfo, match, isancestor
274 292 )
275 293 else:
276 294 revinfo = _revinfo_getter_extra(repo)
277 295 return _combine_changeset_copies_extra(
278 296 revs, children, b.rev(), revinfo, match, isancestor
279 297 )
280 298
281 299
282 300 def _combine_changeset_copies(
283 301 revs, children, targetrev, revinfo, match, isancestor
284 302 ):
285 303 """combine the copies information for each item of iterrevs
286 304
287 305 revs: sorted iterable of revision to visit
288 306 children: a {parent: [children]} mapping.
289 307 targetrev: the final copies destination revision (not in iterrevs)
290 308 revinfo(rev): a function that return (p1, p2, p1copies, p2copies, removed)
291 309 match: a matcher
292 310
293 311 It returns the aggregated copies information for `targetrev`.
294 312 """
295 313 all_copies = {}
296 314 alwaysmatch = match.always()
297 315 for r in revs:
298 316 copies = all_copies.pop(r, None)
299 317 if copies is None:
300 318 # this is a root
301 319 copies = {}
302 320 for i, c in enumerate(children[r]):
303 321 p1, p2, changes = revinfo(c)
304 322 childcopies = {}
305 323 if r == p1:
306 324 parent = 1
307 325 if changes is not None:
308 326 childcopies = changes.copied_from_p1
309 327 else:
310 328 assert r == p2
311 329 parent = 2
312 330 if changes is not None:
313 331 childcopies = changes.copied_from_p2
314 332 if not alwaysmatch:
315 333 childcopies = {
316 334 dst: src for dst, src in childcopies.items() if match(dst)
317 335 }
318 336 newcopies = copies
319 337 if childcopies:
320 338 newcopies = copies.copy()
321 339 for dest, source in pycompat.iteritems(childcopies):
322 340 prev = copies.get(source)
323 341 if prev is not None and prev[1] is not None:
324 342 source = prev[1]
325 343 newcopies[dest] = (c, source)
326 344 assert newcopies is not copies
327 345 if changes is not None:
328 346 for f in changes.removed:
329 347 if f in newcopies:
330 348 if newcopies is copies:
331 349 # copy on write to avoid affecting other potential
332 350 # branches. When there are no other branches, this
333 351 # could be avoided.
334 352 newcopies = copies.copy()
335 353 newcopies[f] = (c, None)
336 354 othercopies = all_copies.get(c)
337 355 if othercopies is None:
338 356 all_copies[c] = newcopies
339 357 else:
340 358 # we are the second parent to work on c, so we need to merge our
341 359 # work with the other.
342 360 #
343 361 # In case of conflict, parent 1 takes precedence over parent 2.
344 362 # This is an arbitrary choice made anew when implementing
345 363 # changeset-based copies. It was made without regard to
346 364 # potential filelog-related behavior.
347 365 if parent == 1:
348 366 _merge_copies_dict(
349 367 othercopies, newcopies, isancestor, changes
350 368 )
351 369 else:
352 370 _merge_copies_dict(
353 371 newcopies, othercopies, isancestor, changes
354 372 )
355 373 all_copies[c] = newcopies
356 374
357 375 final_copies = {}
358 376 for dest, (tt, source) in all_copies[targetrev].items():
359 377 if source is not None:
360 378 final_copies[dest] = source
361 379 return final_copies
362 380
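# Sketch of the final extraction above (hypothetical values): all_copies
# entries map a destination to a (rev, source) tuple, with source None
# when the file was removed again, so
#
#     all_copies[targetrev] == {b'kept': (5, b'orig'), b'gone': (6, None)}
#
# yields final_copies == {b'kept': b'orig'}.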
363 381
364 382 def _merge_copies_dict(minor, major, isancestor, changes):
365 383 """merge two copies-mapping together, minor and major
366 384
367 385 In case of conflict, value from "major" will be picked.
368 386
369 387 - `isancestors(low_rev, high_rev)`: callable return True if `low_rev` is an
370 388 ancestors of `high_rev`,
371 389
372 390 - `ismerged(path)`: callable return True if `path` have been merged in the
373 391 current revision,
374 392 """
375 393 for dest, value in major.items():
376 394 other = minor.get(dest)
377 395 if other is None:
378 396 minor[dest] = value
379 397 else:
380 398 new_tt = value[0]
381 399 other_tt = other[0]
382 400 if value[1] == other[1]:
383 401 continue
384 402 # content from "major" wins, unless it is older
385 403 # than the branch point or there is a merge
386 404 if new_tt == other_tt:
387 405 minor[dest] = value
388 406 elif (
389 407 changes is not None
390 408 and value[1] is None
391 409 and dest in changes.salvaged
392 410 ):
393 411 pass
394 412 elif (
395 413 changes is not None
396 414 and other[1] is None
397 415 and dest in changes.salvaged
398 416 ):
399 417 minor[dest] = value
400 418 elif changes is not None and dest in changes.merged:
401 419 minor[dest] = value
402 420 elif not isancestor(new_tt, other_tt):
403 421 if value[1] is not None:
404 422 minor[dest] = value
405 423 elif isancestor(other_tt, new_tt):
406 424 minor[dest] = value
407 425
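# Illustrative sketch (hypothetical values): with
#
#     minor = {b'dst': (2, b'old')}
#     major = {b'dst': (4, b'new')}
#
# changes=None and revision 4 not an ancestor of revision 2, the
# `not isancestor(new_tt, other_tt)` branch applies; since the major
# source is not None, minor ends up as {b'dst': (4, b'new')}.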
408 426
409 427 def _revinfo_getter_extra(repo):
410 428 """return a function that return multiple data given a <rev>"i
411 429
412 430 * p1: revision number of first parent
413 431 * p2: revision number of second parent
414 432 * p1copies: mapping of copies from p1
415 433 * p2copies: mapping of copies from p2
416 434 * removed: a list of removed files
417 435 * ismerged: a callback to know whether a file was merged in that revision
418 436 """
419 437 cl = repo.changelog
420 438 parents = cl.parentrevs
421 439
422 440 def get_ismerged(rev):
423 441 ctx = repo[rev]
424 442
425 443 def ismerged(path):
426 444 if path not in ctx.files():
427 445 return False
428 446 fctx = ctx[path]
429 447 parents = fctx._filelog.parents(fctx._filenode)
430 448 nb_parents = 0
431 449 for n in parents:
432 450 if n != node.nullid:
433 451 nb_parents += 1
434 452 return nb_parents >= 2
435 453
436 454 return ismerged
437 455
438 456 def revinfo(rev):
439 457 p1, p2 = parents(rev)
440 458 ctx = repo[rev]
441 459 p1copies, p2copies = ctx._copies
442 460 removed = ctx.filesremoved()
443 461 return p1, p2, p1copies, p2copies, removed, get_ismerged(rev)
444 462
445 463 return revinfo
446 464
447 465
448 466 def _combine_changeset_copies_extra(
449 467 revs, children, targetrev, revinfo, match, isancestor
450 468 ):
451 469 """version of `_combine_changeset_copies` that works with the Google
452 470 specific "extra" based storage for copy information"""
453 471 all_copies = {}
454 472 alwaysmatch = match.always()
455 473 for r in revs:
456 474 copies = all_copies.pop(r, None)
457 475 if copies is None:
458 476 # this is a root
459 477 copies = {}
460 478 for i, c in enumerate(children[r]):
461 479 p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c)
462 480 if r == p1:
463 481 parent = 1
464 482 childcopies = p1copies
465 483 else:
466 484 assert r == p2
467 485 parent = 2
468 486 childcopies = p2copies
469 487 if not alwaysmatch:
470 488 childcopies = {
471 489 dst: src for dst, src in childcopies.items() if match(dst)
472 490 }
473 491 newcopies = copies
474 492 if childcopies:
475 493 newcopies = copies.copy()
476 494 for dest, source in pycompat.iteritems(childcopies):
477 495 prev = copies.get(source)
478 496 if prev is not None and prev[1] is not None:
479 497 source = prev[1]
480 498 newcopies[dest] = (c, source)
481 499 assert newcopies is not copies
482 500 for f in removed:
483 501 if f in newcopies:
484 502 if newcopies is copies:
485 503 # copy on write to avoid affecting other potential
486 504 # branches. When there are no other branches, this
487 505 # could be avoided.
488 506 newcopies = copies.copy()
489 507 newcopies[f] = (c, None)
490 508 othercopies = all_copies.get(c)
491 509 if othercopies is None:
492 510 all_copies[c] = newcopies
493 511 else:
494 512 # we are the second parent to work on c, so we need to merge our
495 513 # work with the other.
496 514 #
497 515 # In case of conflict, parent 1 takes precedence over parent 2.
498 516 # This is an arbitrary choice made anew when implementing
499 517 # changeset-based copies. It was made without regard to
500 518 # potential filelog-related behavior.
501 519 if parent == 1:
502 520 _merge_copies_dict_extra(
503 521 othercopies, newcopies, isancestor, ismerged
504 522 )
505 523 else:
506 524 _merge_copies_dict_extra(
507 525 newcopies, othercopies, isancestor, ismerged
508 526 )
509 527 all_copies[c] = newcopies
510 528
511 529 final_copies = {}
512 530 for dest, (tt, source) in all_copies[targetrev].items():
513 531 if source is not None:
514 532 final_copies[dest] = source
515 533 return final_copies
516 534
517 535
518 536 def _merge_copies_dict_extra(minor, major, isancestor, ismerged):
519 537 """version of `_merge_copies_dict` that works with the Google
520 538 specific "extra" based storage for copy information"""
521 539 for dest, value in major.items():
522 540 other = minor.get(dest)
523 541 if other is None:
524 542 minor[dest] = value
525 543 else:
526 544 new_tt = value[0]
527 545 other_tt = other[0]
528 546 if value[1] == other[1]:
529 547 continue
530 548 # content from "major" wins, unless it is older
531 549 # than the branch point or there is a merge
532 550 if (
533 551 new_tt == other_tt
534 552 or not isancestor(new_tt, other_tt)
535 553 or ismerged(dest)
536 554 ):
537 555 minor[dest] = value
538 556
539 557
540 558 def _forwardcopies(a, b, base=None, match=None):
541 559 """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
542 560
543 561 if base is None:
544 562 base = a
545 563 match = a.repo().narrowmatch(match)
546 564 # check for working copy
547 565 if b.rev() is None:
548 566 cm = _committedforwardcopies(a, b.p1(), base, match)
549 567 # combine copies from dirstate if necessary
550 568 copies = _chain(cm, _dirstatecopies(b._repo, match))
551 569 else:
552 570 copies = _committedforwardcopies(a, b, base, match)
553 571 return copies
554 572
555 573
556 574 def _backwardrenames(a, b, match):
557 575 if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
558 576 return {}
559 577
560 578 # Even though we're not taking copies into account, 1:n rename situations
561 579 # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
562 580 # arbitrarily pick one of the renames.
563 581 # We don't want to pass in "match" here, since that would filter
564 582 # the destination by it. Since we're reversing the copies, we want
565 583 # to filter the source instead.
566 584 f = _forwardcopies(b, a)
567 585 r = {}
568 586 for k, v in sorted(pycompat.iteritems(f)):
569 587 if match and not match(v):
570 588 continue
571 589 # remove copies
572 590 if v in a:
573 591 continue
574 592 r[v] = k
575 593 return r
576 594
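# Sketch (hypothetical paths): with 'b' the ancestor and 'a' the
# descendant, _forwardcopies(b, a) may return {b'new': b'old'} ('old'@b
# became 'new'@a). The reversed result is {b'old': b'new'}, unless b'old'
# still exists in 'a', in which case it was a copy rather than a rename
# and the entry is dropped.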
577 595
578 596 def pathcopies(x, y, match=None):
579 597 """find {dst@y: src@x} copy mapping for directed compare"""
580 598 repo = x._repo
581 599 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
582 600 if debug:
583 601 repo.ui.debug(
584 602 b'debug.copies: searching copies from %s to %s\n' % (x, y)
585 603 )
586 604 if x == y or not x or not y:
587 605 return {}
588 606 if y.rev() is None and x == y.p1():
589 607 if debug:
590 608 repo.ui.debug(b'debug.copies: search mode: dirstate\n')
591 609 # short-circuit to avoid issues with merge states
592 610 return _dirstatecopies(repo, match)
593 611 a = y.ancestor(x)
594 612 if a == x:
595 613 if debug:
596 614 repo.ui.debug(b'debug.copies: search mode: forward\n')
597 615 copies = _forwardcopies(x, y, match=match)
598 616 elif a == y:
599 617 if debug:
600 618 repo.ui.debug(b'debug.copies: search mode: backward\n')
601 619 copies = _backwardrenames(x, y, match=match)
602 620 else:
603 621 if debug:
604 622 repo.ui.debug(b'debug.copies: search mode: combined\n')
605 623 base = None
606 624 if a.rev() != node.nullrev:
607 625 base = x
608 626 copies = _chain(
609 627 _backwardrenames(x, a, match=match),
610 628 _forwardcopies(a, y, base, match=match),
611 629 )
612 630 _filter(x, y, copies)
613 631 return copies
614 632
615 633
616 634 def mergecopies(repo, c1, c2, base):
617 635 """
618 636 Finds moves and copies between context c1 and c2 that are relevant for
619 637 merging. 'base' will be used as the merge base.
620 638
621 639 Copytracing is used in commands like rebase, merge, unshelve, etc to merge
622 640 files that were moved/copied in one merge parent and modified in another.
623 641 For example:
624 642
625 643 o ---> 4 another commit
626 644 |
627 645 | o ---> 3 commit that modifies a.txt
628 646 | /
629 647 o / ---> 2 commit that moves a.txt to b.txt
630 648 |/
631 649 o ---> 1 merge base
632 650
633 651 If we try to rebase revision 3 on revision 4, since there is no a.txt in
634 652 revision 4 and the user has copytrace disabled, we print the following
635 653 message:
636 654
637 655 ```other changed <file> which local deleted```
638 656
639 657 Returns a tuple where:
640 658
641 659 "branch_copies" an instance of branch_copies.
642 660
643 661 "diverge" is a mapping of source name -> list of destination names
644 662 for divergent renames.
645 663
646 664 This function calls different copytracing algorithms based on config.
647 665 """
648 666 # avoid silly behavior for update from empty dir
649 667 if not c1 or not c2 or c1 == c2:
650 668 return branch_copies(), branch_copies(), {}
651 669
652 670 narrowmatch = c1.repo().narrowmatch()
653 671
654 672 # avoid silly behavior for parent -> working dir
655 673 if c2.node() is None and c1.node() == repo.dirstate.p1():
656 674 return (
657 675 branch_copies(_dirstatecopies(repo, narrowmatch)),
658 676 branch_copies(),
659 677 {},
660 678 )
661 679
662 680 copytracing = repo.ui.config(b'experimental', b'copytrace')
663 681 if stringutil.parsebool(copytracing) is False:
664 682 # stringutil.parsebool() returns None when it is unable to parse the
665 683 # value, so we keep copytracing enabled in such cases
666 684 return branch_copies(), branch_copies(), {}
667 685
668 686 if usechangesetcentricalgo(repo):
669 687 # The heuristics don't make sense when we need changeset-centric algos
670 688 return _fullcopytracing(repo, c1, c2, base)
671 689
672 690 # Copy trace disabling is checked explicitly below the node == p1 logic
673 691 # above, because that logic is required for a simple copy to be kept
674 692 # across a rebase.
675 693 if copytracing == b'heuristics':
676 694 # Do full copytracing if only non-public revisions are involved as
677 695 # that will be fast enough and will also cover the copies which could
678 696 # be missed by heuristics
679 697 if _isfullcopytraceable(repo, c1, base):
680 698 return _fullcopytracing(repo, c1, c2, base)
681 699 return _heuristicscopytracing(repo, c1, c2, base)
682 700 else:
683 701 return _fullcopytracing(repo, c1, c2, base)
684 702
685 703
686 704 def _isfullcopytraceable(repo, c1, base):
687 705 """ Checks that if base, source and destination are all no-public branches,
688 706 if yes let's use the full copytrace algorithm for increased capabilities
689 707 since it will be fast enough.
690 708
691 709 `experimental.copytrace.sourcecommitlimit` can be used to set a limit on
692 710 the number of changesets from c1 to base; if the number of changesets
693 711 exceeds the limit, the full copytracing algorithm won't be used.
694 712 """
695 713 if c1.rev() is None:
696 714 c1 = c1.p1()
697 715 if c1.mutable() and base.mutable():
698 716 sourcecommitlimit = repo.ui.configint(
699 717 b'experimental', b'copytrace.sourcecommitlimit'
700 718 )
701 719 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
702 720 return commits < sourcecommitlimit
703 721 return False
704 722
705 723
706 724 def _checksinglesidecopies(
707 725 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
708 726 ):
709 727 if src not in m2:
710 728 # deleted on side 2
711 729 if src not in m1:
712 730 # renamed on side 1, deleted on side 2
713 731 renamedelete[src] = dsts1
714 732 elif src not in mb:
715 733 # Work around the "short-circuit to avoid issues with merge states"
716 734 # thing in pathcopies(): pathcopies(x, y) can return a copy where the
717 735 # destination doesn't exist in y.
718 736 pass
719 737 elif mb[src] != m2[src] and not _related(c2[src], base[src]):
720 738 return
721 739 elif mb[src] != m2[src] or mb.flags(src) != m2.flags(src):
722 740 # modified on side 2
723 741 for dst in dsts1:
724 742 copy[dst] = src
725 743
726 744
727 745 class branch_copies(object):
728 746 """Information about copies made on one side of a merge/graft.
729 747
730 748 "copy" is a mapping from destination name -> source name,
731 749 where source is in c1 and destination is in c2 or vice-versa.
732 750
733 751 "movewithdir" is a mapping from source name -> destination name,
734 752 where the file at source is present in one context but not the other
735 753 and needs to be moved to destination by the merge process, because the
736 754 other context moved the directory it is in.
737 755
738 756 "renamedelete" is a mapping of source name -> list of destination
739 757 names for files deleted in c1 that were renamed in c2 or vice-versa.
740 758
741 759 "dirmove" is a mapping of detected source dir -> destination dir renames.
742 760 This is needed for handling changes to new files previously grafted into
743 761 renamed directories.
744 762 """
745 763
746 764 def __init__(
747 765 self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
748 766 ):
749 767 self.copy = {} if copy is None else copy
750 768 self.renamedelete = {} if renamedelete is None else renamedelete
751 769 self.dirmove = {} if dirmove is None else dirmove
752 770 self.movewithdir = {} if movewithdir is None else movewithdir
753 771
754 772 def __repr__(self):
755 773 return (
756 774 '<branch_copies\n copy=%r\n renamedelete=%r\n dirmove=%r\n movewithdir=%r\n>'
757 775 % (self.copy, self.renamedelete, self.dirmove, self.movewithdir,)
758 776 )
759 777
760 778
761 779 def _fullcopytracing(repo, c1, c2, base):
762 780 """ The full copytracing algorithm which finds all the new files that were
763 781 added from merge base up to the top commit and for each file it checks if
764 782 this file was copied from another file.
765 783
766 784 This is pretty slow when a lot of changesets are involved but will track all
767 785 the copies.
768 786 """
769 787 m1 = c1.manifest()
770 788 m2 = c2.manifest()
771 789 mb = base.manifest()
772 790
773 791 copies1 = pathcopies(base, c1)
774 792 copies2 = pathcopies(base, c2)
775 793
776 794 if not (copies1 or copies2):
777 795 return branch_copies(), branch_copies(), {}
778 796
779 797 inversecopies1 = {}
780 798 inversecopies2 = {}
781 799 for dst, src in copies1.items():
782 800 inversecopies1.setdefault(src, []).append(dst)
783 801 for dst, src in copies2.items():
784 802 inversecopies2.setdefault(src, []).append(dst)
785 803
786 804 copy1 = {}
787 805 copy2 = {}
788 806 diverge = {}
789 807 renamedelete1 = {}
790 808 renamedelete2 = {}
791 809 allsources = set(inversecopies1) | set(inversecopies2)
792 810 for src in allsources:
793 811 dsts1 = inversecopies1.get(src)
794 812 dsts2 = inversecopies2.get(src)
795 813 if dsts1 and dsts2:
796 814 # copied/renamed on both sides
797 815 if src not in m1 and src not in m2:
798 816 # renamed on both sides
799 817 dsts1 = set(dsts1)
800 818 dsts2 = set(dsts2)
801 819 # If there's some overlap in the rename destinations, we
802 820 # consider it not divergent. For example, if side 1 copies 'a'
803 821 # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
804 822 # and 'd' and deletes 'a'.
805 823 if dsts1 & dsts2:
806 824 for dst in dsts1 & dsts2:
807 825 copy1[dst] = src
808 826 copy2[dst] = src
809 827 else:
810 828 diverge[src] = sorted(dsts1 | dsts2)
811 829 elif src in m1 and src in m2:
812 830 # copied on both sides
813 831 dsts1 = set(dsts1)
814 832 dsts2 = set(dsts2)
815 833 for dst in dsts1 & dsts2:
816 834 copy1[dst] = src
817 835 copy2[dst] = src
818 836 # TODO: Handle cases where it was renamed on one side and copied
819 837 # on the other side
820 838 elif dsts1:
821 839 # copied/renamed only on side 1
822 840 _checksinglesidecopies(
823 841 src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
824 842 )
825 843 elif dsts2:
826 844 # copied/renamed only on side 2
827 845 _checksinglesidecopies(
828 846 src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
829 847 )
830 848
831 849 # find interesting file sets from manifests
832 850 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
833 851 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
834 852 u1 = sorted(addedinm1 - addedinm2)
835 853 u2 = sorted(addedinm2 - addedinm1)
836 854
837 855 header = b" unmatched files in %s"
838 856 if u1:
839 857 repo.ui.debug(b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1)))
840 858 if u2:
841 859 repo.ui.debug(b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2)))
842 860
843 861 if repo.ui.debugflag:
844 862 renamedeleteset = set()
845 863 divergeset = set()
846 864 for dsts in diverge.values():
847 865 divergeset.update(dsts)
848 866 for dsts in renamedelete1.values():
849 867 renamedeleteset.update(dsts)
850 868 for dsts in renamedelete2.values():
851 869 renamedeleteset.update(dsts)
852 870
853 871 repo.ui.debug(
854 872 b" all copies found (* = to merge, ! = divergent, "
855 873 b"% = renamed and deleted):\n"
856 874 )
857 875 for side, copies in ((b"local", copies1), (b"remote", copies2)):
858 876 if not copies:
859 877 continue
860 878 repo.ui.debug(b" on %s side:\n" % side)
861 879 for f in sorted(copies):
862 880 note = b""
863 881 if f in copy1 or f in copy2:
864 882 note += b"*"
865 883 if f in divergeset:
866 884 note += b"!"
867 885 if f in renamedeleteset:
868 886 note += b"%"
869 887 repo.ui.debug(
870 888 b" src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
871 889 )
872 890 del renamedeleteset
873 891 del divergeset
874 892
875 893 repo.ui.debug(b" checking for directory renames\n")
876 894
877 895 dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2)
878 896 dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1)
879 897
880 898 branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
881 899 branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)
882 900
883 901 return branch_copies1, branch_copies2, diverge
884 902
885 903
886 904 def _dir_renames(repo, ctx, copy, fullcopy, addedfiles):
887 905 """Finds moved directories and files that should move with them.
888 906
889 907 ctx: the context for one of the sides
890 908 copy: files copied on the same side (as ctx)
891 909 fullcopy: files copied on the same side (as ctx), including those that
892 910 merge.manifestmerge() won't care about
893 911 addedfiles: added files on the other side (compared to ctx)
894 912 """
895 913 # generate a directory move map
896 914 d = ctx.dirs()
897 915 invalid = set()
898 916 dirmove = {}
899 917
900 918 # examine each file copy for a potential directory move, which is
901 919 # when all the files in a directory are moved to a new directory
902 920 for dst, src in pycompat.iteritems(fullcopy):
903 921 dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
904 922 if dsrc in invalid:
905 923 # already seen to be uninteresting
906 924 continue
907 925 elif dsrc in d and ddst in d:
908 926 # directory wasn't entirely moved locally
909 927 invalid.add(dsrc)
910 928 elif dsrc in dirmove and dirmove[dsrc] != ddst:
911 929 # files from the same directory moved to two different places
912 930 invalid.add(dsrc)
913 931 else:
914 932 # looks good so far
915 933 dirmove[dsrc] = ddst
916 934
917 935 for i in invalid:
918 936 if i in dirmove:
919 937 del dirmove[i]
920 938 del d, invalid
921 939
922 940 if not dirmove:
923 941 return {}, {}
924 942
925 943 dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}
926 944
927 945 for d in dirmove:
928 946 repo.ui.debug(
929 947 b" discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
930 948 )
931 949
932 950 movewithdir = {}
933 951 # check unaccounted nonoverlapping files against directory moves
934 952 for f in addedfiles:
935 953 if f not in fullcopy:
936 954 for d in dirmove:
937 955 if f.startswith(d):
938 956 # new file added in a directory that was moved, move it
939 957 df = dirmove[d] + f[len(d) :]
940 958 if df not in copy:
941 959 movewithdir[f] = df
942 960 repo.ui.debug(
943 961 b" pending file src: '%s' -> dst: '%s'\n"
944 962 % (f, df)
945 963 )
946 964 break
947 965
948 966 return dirmove, movewithdir
949 967
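# Illustrative sketch (hypothetical paths): if every file of 'lib/' was
# moved to 'src/' on this side (fullcopy == {b'src/x.py': b'lib/x.py'})
# and 'lib' no longer exists in ctx.dirs(), dirmove becomes
# {b'lib/': b'src/'}; a file b'lib/new.py' added on the other side is then
# reported in movewithdir as moving to b'src/new.py'.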
950 968
951 969 def _heuristicscopytracing(repo, c1, c2, base):
952 970 """ Fast copytracing using filename heuristics
953 971
954 972 Assumes that moves or renames are of the following two types:
955 973
956 974 1) Inside a directory only (same directory name but different filenames)
957 975 2) Move from one directory to another
958 976 (same filenames but different directory names)
959 977
960 978 Works only when there are no merge commits in the "source branch".
961 979 The source branch is the commits from base up to c2, not including base.
962 980
963 981 If a merge is involved, it falls back to _fullcopytracing().
964 982
965 983 Can be used by setting the following config:
966 984
967 985 [experimental]
968 986 copytrace = heuristics
969 987
970 988 In some cases the copy/move candidates found by heuristics can be very large
971 989 in number and that will make the algorithm slow. The number of possible
972 990 candidates to check can be limited by using the config
973 991 `experimental.copytrace.movecandidateslimit` which defaults to 100.
974 992 """
975 993
976 994 if c1.rev() is None:
977 995 c1 = c1.p1()
978 996 if c2.rev() is None:
979 997 c2 = c2.p1()
980 998
981 999 changedfiles = set()
982 1000 m1 = c1.manifest()
983 1001 if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
984 1002 # If base is not in c2 branch, we switch to fullcopytracing
985 1003 repo.ui.debug(
986 1004 b"switching to full copytracing as base is not "
987 1005 b"an ancestor of c2\n"
988 1006 )
989 1007 return _fullcopytracing(repo, c1, c2, base)
990 1008
991 1009 ctx = c2
992 1010 while ctx != base:
993 1011 if len(ctx.parents()) == 2:
994 1012 # To keep things simple let's not handle merges
995 1013 repo.ui.debug(b"switching to full copytracing because of merges\n")
996 1014 return _fullcopytracing(repo, c1, c2, base)
997 1015 changedfiles.update(ctx.files())
998 1016 ctx = ctx.p1()
999 1017
1000 1018 copies2 = {}
1001 1019 cp = _forwardcopies(base, c2)
1002 1020 for dst, src in pycompat.iteritems(cp):
1003 1021 if src in m1:
1004 1022 copies2[dst] = src
1005 1023
1006 1024 # file is missing if it isn't present in the destination, but is present in
1007 1025 # the base and present in the source.
1008 1026 # Presence in the base is important to exclude added files, presence in the
1009 1027 # source is important to exclude removed files.
1010 1028 filt = lambda f: f not in m1 and f in base and f in c2
1011 1029 missingfiles = [f for f in changedfiles if filt(f)]
1012 1030
1013 1031 copies1 = {}
1014 1032 if missingfiles:
1015 1033 basenametofilename = collections.defaultdict(list)
1016 1034 dirnametofilename = collections.defaultdict(list)
1017 1035
1018 1036 for f in m1.filesnotin(base.manifest()):
1019 1037 basename = os.path.basename(f)
1020 1038 dirname = os.path.dirname(f)
1021 1039 basenametofilename[basename].append(f)
1022 1040 dirnametofilename[dirname].append(f)
1023 1041
1024 1042 for f in missingfiles:
1025 1043 basename = os.path.basename(f)
1026 1044 dirname = os.path.dirname(f)
1027 1045 samebasename = basenametofilename[basename]
1028 1046 samedirname = dirnametofilename[dirname]
1029 1047 movecandidates = samebasename + samedirname
1030 1048 # f is guaranteed to be present in c2, that's why
1031 1049 # c2.filectx(f) won't fail
1032 1050 f2 = c2.filectx(f)
1033 1051 # we can have a lot of candidates, which can slow down the heuristics;
1034 1052 # a config value limits the number of candidate moves to check
1035 1053 maxcandidates = repo.ui.configint(
1036 1054 b'experimental', b'copytrace.movecandidateslimit'
1037 1055 )
1038 1056
1039 1057 if len(movecandidates) > maxcandidates:
1040 1058 repo.ui.status(
1041 1059 _(
1042 1060 b"skipping copytracing for '%s', more "
1043 1061 b"candidates than the limit: %d\n"
1044 1062 )
1045 1063 % (f, len(movecandidates))
1046 1064 )
1047 1065 continue
1048 1066
1049 1067 for candidate in movecandidates:
1050 1068 f1 = c1.filectx(candidate)
1051 1069 if _related(f1, f2):
1052 1070 # if there are a few related copies then we'll merge
1053 1071 # changes into all of them. This matches the behaviour
1054 1072 # of upstream copytracing
1055 1073 copies1[candidate] = f
1056 1074
1057 1075 return branch_copies(copies1), branch_copies(copies2), {}
1058 1076
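# Sketch of the candidate search (hypothetical paths): for a missing file
# b'dir/a.txt', the files added in c1 that share its basename (e.g.
# b'other/a.txt') or its directory (e.g. b'dir/b.txt') are gathered as
# move candidates, each of which is then confirmed or rejected with
# _related().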
1059 1077
1060 1078 def _related(f1, f2):
1061 1079 """return True if f1 and f2 filectx have a common ancestor
1062 1080
1063 1081 Walk back to common ancestor to see if the two files originate
1064 1082 from the same file. Since workingfilectx's rev() is None it messes
1065 1083 up the integer comparison logic, hence the pre-step check for
1066 1084 None (f1 and f2 can only be workingfilectx's initially).
1067 1085 """
1068 1086
1069 1087 if f1 == f2:
1070 1088 return True # a match
1071 1089
1072 1090 g1, g2 = f1.ancestors(), f2.ancestors()
1073 1091 try:
1074 1092 f1r, f2r = f1.linkrev(), f2.linkrev()
1075 1093
1076 1094 if f1r is None:
1077 1095 f1 = next(g1)
1078 1096 if f2r is None:
1079 1097 f2 = next(g2)
1080 1098
1081 1099 while True:
1082 1100 f1r, f2r = f1.linkrev(), f2.linkrev()
1083 1101 if f1r > f2r:
1084 1102 f1 = next(g1)
1085 1103 elif f2r > f1r:
1086 1104 f2 = next(g2)
1087 1105 else: # f1 and f2 point to files in the same linkrev
1088 1106 return f1 == f2 # true if they point to the same file
1089 1107 except StopIteration:
1090 1108 return False
1091 1109
1092 1110
1093 1111 def graftcopies(wctx, ctx, base):
1094 1112 """reproduce copies between base and ctx in the wctx
1095 1113
1096 1114 Unlike mergecopies(), this function will only consider copies between base
1097 1115 and ctx; it will ignore copies between base and wctx. Also unlike
1098 1116 mergecopies(), this function will apply copies to the working copy (instead
1099 1117 of just returning information about the copies). That makes it cheaper
1100 1118 (especially in the common case of base==ctx.p1()) and useful also when
1101 1119 experimental.copytrace=off.
1102 1120
1103 1121 merge.update() will have already marked most copies, but it will only
1104 1122 mark copies if it thinks the source files are related (see
1105 1123 merge._related()). It will also not mark copies if the file wasn't modified
1106 1124 on the local side. This function adds the copies that were "missed"
1107 1125 by merge.update().
1108 1126 """
1109 1127 new_copies = pathcopies(base, ctx)
1110 1128 _filter(wctx.p1(), wctx, new_copies)
1111 1129 for dst, src in pycompat.iteritems(new_copies):
1112 1130 wctx[dst].markcopied(src)