# Review-page header (scrape residue, not part of copies.py):
#   changeset r46725 (branch: default), author: Kyle Lippincott
#   "copies: make calculating lazy for dir move detection's 'addedfiles'"
#   diff hunk: @@ -1,1178 +1,1194 @@
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import os
12 12
13 13 from .i18n import _
14 14
15 15
16 16 from . import (
17 17 match as matchmod,
18 18 node,
19 19 pathutil,
20 20 policy,
21 21 pycompat,
22 22 util,
23 23 )
24 24
25 25
26 26 from .utils import stringutil
27 27
28 28 from .revlogutils import (
29 29 flagutil,
30 30 sidedata as sidedatamod,
31 31 )
32 32
33 33 rustmod = policy.importrust("copy_tracing")
34 34
35 35
36 36 def _filter(src, dst, t):
37 37 """filters out invalid copies after chaining"""
38 38
39 39 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
40 40 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
41 41 # in the following table (not including trivial cases). For example, case 2
42 42 # is where a file existed in 'src' and remained under that name in 'mid' and
43 43 # then was renamed between 'mid' and 'dst'.
44 44 #
45 45 # case src mid dst result
46 46 # 1 x y - -
47 47 # 2 x y y x->y
48 48 # 3 x y x -
49 49 # 4 x y z x->z
50 50 # 5 - x y -
51 51 # 6 x x y x->y
52 52 #
53 53 # _chain() takes care of chaining the copies in 'a' and 'b', but it
54 54 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
55 55 # between 5 and 6, so it includes all cases in its result.
56 56 # Cases 1, 3, and 5 are then removed by _filter().
57 57
58 58 for k, v in list(t.items()):
59 59 # remove copies from files that didn't exist
60 60 if v not in src:
61 61 del t[k]
62 62 # remove criss-crossed copies
63 63 elif k in src and v in dst:
64 64 del t[k]
65 65 # remove copies to files that were then removed
66 66 elif k not in dst:
67 67 del t[k]
68 68
69 69
def _chain(prefix, suffix):
    """chain two sets of copies 'prefix' and 'suffix'"""
    # start from 'prefix'; each entry of 'suffix' either extends a chain
    # begun in 'prefix' (dst -> original source) or stands on its own
    chained = dict(prefix)
    for dst, src in suffix.items():
        chained[dst] = prefix.get(src, src)
    return chained
76 76
77 77
78 78 def _tracefile(fctx, am, basemf):
79 79 """return file context that is the ancestor of fctx present in ancestor
80 80 manifest am
81 81
82 82 Note: we used to try and stop after a given limit, however checking if that
83 83 limit is reached turned out to be very expensive. we are better off
84 84 disabling that feature."""
85 85
86 86 for f in fctx.ancestors():
87 87 path = f.path()
88 88 if am.get(path, None) == f.filenode():
89 89 return path
90 90 if basemf and basemf.get(path, None) == f.filenode():
91 91 return path
92 92
93 93
94 94 def _dirstatecopies(repo, match=None):
95 95 ds = repo.dirstate
96 96 c = ds.copies().copy()
97 97 for k in list(c):
98 98 if ds[k] not in b'anm' or (match and not match(k)):
99 99 del c[k]
100 100 return c
101 101
102 102
103 103 def _computeforwardmissing(a, b, match=None):
104 104 """Computes which files are in b but not a.
105 105 This is its own function so extensions can easily wrap this call to see what
106 106 files _forwardcopies is about to process.
107 107 """
108 108 ma = a.manifest()
109 109 mb = b.manifest()
110 110 return mb.filesnotin(ma, match=match)
111 111
112 112
def usechangesetcentricalgo(repo):
    """Checks if we should use changeset-centric copy algorithms"""
    # sidedata storage implies changeset-centric data, no config needed
    if repo.filecopiesmode == b'changeset-sidedata':
        return True
    readfrom = repo.ui.config(b'experimental', b'copies.read-from')
    return readfrom in (b'changeset-only', b'compatibility')
120 120
121 121
def _committedforwardcopies(a, b, base, match):
    """Like _forwardcopies(), but b.rev() cannot be None (working copy)

    Returns a {dst@b: src@a} mapping built by tracing the files present in
    ``b`` but missing from ``a`` back through filelog ancestry.  ``base``
    (may be None) supplies a second manifest in which a traced ancestor also
    counts as found; ``match`` restricts which files are considered.
    """
    # files might have to be traced back to the fctx parent of the last
    # one-side-only changeset, but not further back than that
    repo = a._repo

    if usechangesetcentricalgo(repo):
        return _changesetforwardcopies(a, b, match)

    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
    dbg = repo.ui.debug
    if debug:
        dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
    am = a.manifest()
    basemf = None if base is None else base.manifest()

    # find where new files came from
    # we currently don't try to find where old files went, too expensive
    # this means we can miss a case like 'hg rm b; hg cp a b'
    cm = {}

    # Computing the forward missing is quite expensive on large manifests, since
    # it compares the entire manifests. We can optimize it in the common use
    # case of computing what copies are in a commit versus its parent (like
    # during a rebase or histedit). Note, we exclude merge commits from this
    # optimization, since the ctx.files() for a merge commit is not correct for
    # this comparison.
    forwardmissingmatch = match
    if b.p1() == a and b.p2().node() == node.nullid:
        filesmatcher = matchmod.exact(b.files())
        forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
    missing = _computeforwardmissing(a, b, match=forwardmissingmatch)

    # computed once and attached to every traced filectx below, so that the
    # per-file ancestry walks can share it
    ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)

    if debug:
        dbg(b'debug.copies: missing files to search: %d\n' % len(missing))

    for f in sorted(missing):
        if debug:
            dbg(b'debug.copies: tracing file: %s\n' % f)
        fctx = b[f]
        fctx._ancestrycontext = ancestrycontext

        if debug:
            start = util.timer()
        opath = _tracefile(fctx, am, basemf)
        if opath:
            if debug:
                dbg(b'debug.copies: rename of: %s\n' % opath)
            cm[f] = opath
        if debug:
            dbg(
                b'debug.copies: time: %f seconds\n'
                % (util.timer() - start)
            )
    return cm
179 179
180 180
def _revinfo_getter(repo, match):
    """returns a function that returns the following data given a <rev>

    * p1: revision number of first parent
    * p2: revision number of second parent
    * changes: a ChangingFiles object, or None when the revision carries no
      copy information (in the rust fast path this third item is the raw
      sidedata blob instead)
    """
    cl = repo.changelog
    parents = cl.parentrevs
    flags = cl.flags

    HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO

    changelogrevision = cl.changelogrevision

    # A small cache to avoid doing the work twice for merges
    #
    # In the vast majority of cases, if we ask information for a revision
    # about 1 parent, we'll later ask it for the other. So it make sense to
    # keep the information around when reaching the first parent of a merge
    # and dropping it after it was provided for the second parents.
    #
    # It exists cases were only one parent of the merge will be walked. It
    # happens when the "destination" the copy tracing is descendant from a
    # new root, not common with the "source". In that case, we will only walk
    # through merge parents that are descendant of changesets common
    # between "source" and "destination".
    #
    # With the current case implementation if such changesets have a copy
    # information, we'll keep them in memory until the end of
    # _changesetforwardcopies. We don't expect the case to be frequent
    # enough to matters.
    #
    # In addition, it would be possible to reach pathological case, were
    # many first parent are met before any second parent is reached. In
    # that case the cache could grow. If this even become an issue one can
    # safely introduce a maximum cache size. This would trade extra CPU/IO
    # time to save memory.
    merge_caches = {}

    alwaysmatch = match.always()

    if rustmod is not None and alwaysmatch:
        # rust fast path: return the raw sidedata blob, which the rust
        # combiner parses itself
        def revinfo(rev):
            p1, p2 = parents(rev)
            value = None
            e = merge_caches.pop(rev, None)
            if e is not None:
                return e
            if flags(rev) & HASCOPIESINFO:
                raw = changelogrevision(rev)._sidedata.get(sidedatamod.SD_FILES)
            else:
                raw = None
            value = (p1, p2, raw)
            if p1 != node.nullrev and p2 != node.nullrev:
                # XXX some case we over cache, IGNORE
                merge_caches[rev] = value
            return value

    else:

        def revinfo(rev):
            p1, p2 = parents(rev)
            value = None
            e = merge_caches.pop(rev, None)
            if e is not None:
                return e
            changes = None
            if flags(rev) & HASCOPIESINFO:
                changes = changelogrevision(rev).changes
            value = (p1, p2, changes)
            if p1 != node.nullrev and p2 != node.nullrev:
                # XXX some case we over cache, IGNORE
                merge_caches[rev] = value
            return value

    return revinfo
259 259
260 260
def cached_is_ancestor(is_ancestor):
    """return a cached version of is_ancestor"""
    memo = {}

    def _is_ancestor(anc, desc):
        # cheap answers first: a revision is its own ancestor, and an
        # ancestor's revision number can never exceed its descendant's
        if anc == desc:
            return True
        if anc > desc:
            return False
        key = (anc, desc)
        try:
            return memo[key]
        except KeyError:
            res = memo[key] = is_ancestor(anc, desc)
            return res

    return _is_ancestor
277 277
278 278
def _changesetforwardcopies(a, b, match):
    """Changeset-centric version of _committedforwardcopies()

    Walks the changesets that are ancestors of ``b`` but not of ``a`` and
    combines the copy information they store, returning the aggregated
    copies mapping for ``b``.
    """
    if a.rev() in (node.nullrev, b.rev()):
        return {}

    repo = a.repo().unfiltered()
    children = {}

    cl = repo.changelog
    isancestor = cl.isancestorrev
    # the revisions whose stored copy information must be combined
    missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
    mrset = set(missingrevs)
    roots = set()
    for r in missingrevs:
        for p in cl.parentrevs(r):
            if p == node.nullrev:
                continue
            if p not in children:
                children[p] = [r]
            else:
                children[p].append(r)
            # a parent outside the missing set is a starting point of the walk
            if p not in mrset:
                roots.add(p)
    if not roots:
        # no common revision to track copies from
        return {}
    min_root = min(roots)

    # restrict the walk to revisions actually on a path from a root to b
    from_head = set(
        cl.reachableroots(min_root, [b.rev()], list(roots), includepath=True)
    )

    iterrevs = set(from_head)
    iterrevs &= mrset
    iterrevs.update(roots)
    # b is the destination; the combiner handles it, it is not visited
    iterrevs.remove(b.rev())
    revs = sorted(iterrevs)

    if repo.filecopiesmode == b'changeset-sidedata':
        # modern sidedata-based storage
        revinfo = _revinfo_getter(repo, match)
        return _combine_changeset_copies(
            revs, children, b.rev(), revinfo, match, isancestor
        )
    else:
        # legacy "extra"-based storage
        revinfo = _revinfo_getter_extra(repo)
        return _combine_changeset_copies_extra(
            revs, children, b.rev(), revinfo, match, isancestor
        )
326 326
327 327
def _combine_changeset_copies(
    revs, children, targetrev, revinfo, match, isancestor
):
    """combine the copies information for each item of iterrevs

    revs: sorted iterable of revision to visit
    children: a {parent: [children]} mapping.
    targetrev: the final copies destination revision (not in iterrevs)
    revinfo(rev): a function that returns (p1, p2, changes), "changes" being
        a ChangingFiles object or None
    match: a matcher

    It returns the aggregated copies information for `targetrev`.
    """

    alwaysmatch = match.always()

    # the rust fast path is only usable when the matcher matches everything
    if rustmod is not None and alwaysmatch:
        return rustmod.combine_changeset_copies(
            list(revs), children, targetrev, revinfo, isancestor
        )

    isancestor = cached_is_ancestor(isancestor)

    # all_copies maps revision -> {dst: (rev, source-or-None)}; an entry is
    # popped as soon as the parent has handed its data to all its children.
    # A None source marks a copy invalidated by a later removal.
    all_copies = {}
    for r in revs:
        copies = all_copies.pop(r, None)
        if copies is None:
            # this is a root
            copies = {}
        for i, c in enumerate(children[r]):
            p1, p2, changes = revinfo(c)
            childcopies = {}
            if r == p1:
                parent = 1
                if changes is not None:
                    childcopies = changes.copied_from_p1
            else:
                assert r == p2
                parent = 2
                if changes is not None:
                    childcopies = changes.copied_from_p2
            if not alwaysmatch:
                childcopies = {
                    dst: src for dst, src in childcopies.items() if match(dst)
                }
            # `newcopies` aliases `copies` until a write forces a copy (COW)
            newcopies = copies
            if childcopies:
                newcopies = copies.copy()
                for dest, source in pycompat.iteritems(childcopies):
                    prev = copies.get(source)
                    if prev is not None and prev[1] is not None:
                        # collapse chained copies to the original source
                        source = prev[1]
                    newcopies[dest] = (c, source)
                assert newcopies is not copies
            if changes is not None and changes.removed:
                if newcopies is copies:
                    newcopies = copies.copy()
                for f in changes.removed:
                    if f in newcopies:
                        if newcopies is copies:
                            # copy on write to avoid affecting potential other
                            # branches. when there are no other branches, this
                            # could be avoided.
                            newcopies = copies.copy()
                        newcopies[f] = (c, None)
            othercopies = all_copies.get(c)
            if othercopies is None:
                all_copies[c] = newcopies
            elif newcopies is othercopies:
                # nothing to merge:
                pass
            else:
                # we are the second parent to work on c, we need to merge our
                # work with the other.
                #
                # In case of conflict, parent 1 take precedence over parent 2.
                # This is an arbitrary choice made anew when implementing
                # changeset based copies. It was made without regards with
                # potential filelog related behavior.
                if parent == 1:
                    if newcopies is copies:
                        newcopies = copies.copy()
                    minor, major = othercopies, newcopies
                else:
                    # we do not know if the other dict is a copy or not, so we
                    # need to blindly copy it. Future change should make this
                    # unnecessary.
                    minor, major = newcopies, othercopies.copy()
                copies = _merge_copies_dict(minor, major, isancestor, changes)
                all_copies[c] = copies

    # strip the revision component and drop entries invalidated by removal
    final_copies = {}
    for dest, (tt, source) in all_copies[targetrev].items():
        if source is not None:
            final_copies[dest] = source
    return final_copies
424 424
425 425
def _merge_copies_dict(minor, major, isancestor, changes):
    """merge two copies-mapping together, minor and major

    In case of conflict, value from "major" will be picked.

    - `isancestor(low_rev, high_rev)`: callable return True if `low_rev` is an
      ancestors of `high_rev`,

    - `changes`: a ChangingFiles object for the merge revision (or None),
      consulted for its `salvaged` and `merged` file sets,

    return the resulting dict (in practice, the "minor" object, updated)
    """
    for dest, value in major.items():
        other = minor.get(dest)
        if other is None:
            # no competing entry: adopt the major one as-is
            minor[dest] = value
        else:
            new_tt = value[0]
            other_tt = other[0]
            if value[1] == other[1]:
                # both sides agree on the source; nothing to arbitrate
                continue
            # content from "major" wins, unless it is older
            # than the branch point or there is a merge
            if new_tt == other_tt:
                # same revision on both sides: major wins by contract
                minor[dest] = value
            elif (
                changes is not None
                and value[1] is None
                and dest in changes.salvaged
            ):
                # major records a removal but the file was salvaged here:
                # keep the minor information
                pass
            elif (
                changes is not None
                and other[1] is None
                and dest in changes.salvaged
            ):
                # minor records a removal but the file was salvaged here:
                # the major information survives
                minor[dest] = value
            elif changes is not None and dest in changes.merged:
                # the file was merged in this revision: major wins
                minor[dest] = value
            elif not isancestor(new_tt, other_tt):
                # major's revision is not an ancestor of minor's...
                if value[1] is not None:
                    # ...and is an actual copy (not a removal): take it
                    minor[dest] = value
            elif isancestor(other_tt, new_tt):
                # minor's revision predates major's on the same branch
                minor[dest] = value
    return minor
472 472
473 473
def _revinfo_getter_extra(repo):
    """return a function that return multiple data given a <rev>

    * p1: revision number of first parent
    * p2: revision number of second parent
    * p1copies: mapping of copies from p1
    * p2copies: mapping of copies from p2
    * removed: a list of removed files
    * ismerged: a callback to know if file was merged in that revision
    """
    cl = repo.changelog
    parents = cl.parentrevs

    def get_ismerged(rev):
        # build an `ismerged(path)` callback bound to <rev>
        ctx = repo[rev]

        def ismerged(path):
            if path not in ctx.files():
                return False
            fctx = ctx[path]
            parents = fctx._filelog.parents(fctx._filenode)
            # a file revision with two non-null filelog parents is the
            # result of a merge
            nb_parents = 0
            for n in parents:
                if n != node.nullid:
                    nb_parents += 1
            return nb_parents >= 2

        return ismerged

    def revinfo(rev):
        p1, p2 = parents(rev)
        ctx = repo[rev]
        p1copies, p2copies = ctx._copies
        removed = ctx.filesremoved()
        return p1, p2, p1copies, p2copies, removed, get_ismerged(rev)

    return revinfo
511 511
512 512
def _combine_changeset_copies_extra(
    revs, children, targetrev, revinfo, match, isancestor
):
    """version of `_combine_changeset_copies` that works with the Google
    specific "extra" based storage for copy information"""
    # all_copies maps revision -> {dst: (rev, source-or-None)}; a None
    # source marks a copy invalidated by a later removal
    all_copies = {}
    alwaysmatch = match.always()
    for r in revs:
        copies = all_copies.pop(r, None)
        if copies is None:
            # this is a root
            copies = {}
        for i, c in enumerate(children[r]):
            p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c)
            if r == p1:
                parent = 1
                childcopies = p1copies
            else:
                assert r == p2
                parent = 2
                childcopies = p2copies
            if not alwaysmatch:
                # keep only copies whose destination the matcher accepts
                childcopies = {
                    dst: src for dst, src in childcopies.items() if match(dst)
                }
            # `newcopies` aliases `copies` until a write forces a copy (COW)
            newcopies = copies
            if childcopies:
                newcopies = copies.copy()
                for dest, source in pycompat.iteritems(childcopies):
                    prev = copies.get(source)
                    if prev is not None and prev[1] is not None:
                        # collapse chained copies to the original source
                        source = prev[1]
                    newcopies[dest] = (c, source)
                assert newcopies is not copies
            for f in removed:
                if f in newcopies:
                    if newcopies is copies:
                        # copy on write to avoid affecting potential other
                        # branches. when there are no other branches, this
                        # could be avoided.
                        newcopies = copies.copy()
                    newcopies[f] = (c, None)
            othercopies = all_copies.get(c)
            if othercopies is None:
                all_copies[c] = newcopies
            else:
                # we are the second parent to work on c, we need to merge our
                # work with the other.
                #
                # In case of conflict, parent 1 take precedence over parent 2.
                # This is an arbitrary choice made anew when implementing
                # changeset based copies. It was made without regards with
                # potential filelog related behavior.
                if parent == 1:
                    _merge_copies_dict_extra(
                        othercopies, newcopies, isancestor, ismerged
                    )
                else:
                    _merge_copies_dict_extra(
                        newcopies, othercopies, isancestor, ismerged
                    )
                    all_copies[c] = newcopies

    # strip the revision component and drop invalidated entries
    final_copies = {}
    for dest, (tt, source) in all_copies[targetrev].items():
        if source is not None:
            final_copies[dest] = source
    return final_copies
581 581
582 582
583 583 def _merge_copies_dict_extra(minor, major, isancestor, ismerged):
584 584 """version of `_merge_copies_dict` that works with the Google
585 585 specific "extra" based storage for copy information"""
586 586 for dest, value in major.items():
587 587 other = minor.get(dest)
588 588 if other is None:
589 589 minor[dest] = value
590 590 else:
591 591 new_tt = value[0]
592 592 other_tt = other[0]
593 593 if value[1] == other[1]:
594 594 continue
595 595 # content from "major" wins, unless it is older
596 596 # than the branch point or there is a merge
597 597 if (
598 598 new_tt == other_tt
599 599 or not isancestor(new_tt, other_tt)
600 600 or ismerged(dest)
601 601 ):
602 602 minor[dest] = value
603 603
604 604
def _forwardcopies(a, b, base=None, match=None):
    """find {dst@b: src@a} copy mapping where a is an ancestor of b"""

    if base is None:
        base = a
    match = a.repo().narrowmatch(match)
    if b.rev() is not None:
        return _committedforwardcopies(a, b, base, match)
    # b is the working copy: trace up to its first parent, then layer the
    # copies recorded in the dirstate on top
    committed = _committedforwardcopies(a, b.p1(), base, match)
    return _chain(committed, _dirstatecopies(b._repo, match))
619 619
620 620
def _backwardrenames(a, b, match):
    if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
        return {}

    # Copies are computed forward (b -> a) and then reversed.  Even though
    # copies are not taken into account, 1:n rename situations can still
    # exist (e.g. hg cp a b; hg mv a c); sorting makes the arbitrary pick
    # deterministic.  The matcher is not passed down to _forwardcopies()
    # because it would filter the destinations; since the result is
    # reversed, the sources must be filtered instead.
    reversed_copies = {}
    for dest, src in sorted(_forwardcopies(b, a).items()):
        if match and not match(src):
            continue
        # drop plain copies: the source still exists in 'a'
        if src in a:
            continue
        reversed_copies[src] = dest
    return reversed_copies
641 641
642 642
def pathcopies(x, y, match=None):
    """find {dst@y: src@x} copy mapping for directed compare"""
    repo = x._repo
    dbg = repo.ui.debug
    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
    if debug:
        dbg(b'debug.copies: searching copies from %s to %s\n' % (x, y))
    if x == y or not x or not y:
        return {}
    if y.rev() is None and x == y.p1():
        if debug:
            dbg(b'debug.copies: search mode: dirstate\n')
        # short-circuit to avoid issues with merge states
        return _dirstatecopies(repo, match)
    a = y.ancestor(x)
    if a == x:
        # x is an ancestor of y: plain forward search
        if debug:
            dbg(b'debug.copies: search mode: forward\n')
        copies = _forwardcopies(x, y, match=match)
    elif a == y:
        # y is an ancestor of x: reverse the forward search
        if debug:
            dbg(b'debug.copies: search mode: backward\n')
        copies = _backwardrenames(x, y, match=match)
    else:
        # unrelated branches: go back to the common ancestor, then forward
        if debug:
            dbg(b'debug.copies: search mode: combined\n')
        base = x if a.rev() != node.nullrev else None
        copies = _chain(
            _backwardrenames(x, a, match=match),
            _forwardcopies(a, y, base, match=match),
        )
    _filter(x, y, copies)
    return copies
679 679
680 680
def mergecopies(repo, c1, c2, base):
    """
    Finds moves and copies between context c1 and c2 that are relevant for
    merging. 'base' will be used as the merge base.

    Copytracing is used in commands like rebase, merge, unshelve, etc to merge
    files that were moved/ copied in one merge parent and modified in another.
    For example:

    o ---> 4 another commit
    |
    | o ---> 3 commit that modifies a.txt
    | /
    o / ---> 2 commit that moves a.txt to b.txt
    |/
    o ---> 1 merge base

    If we try to rebase revision 3 on revision 4, since there is no a.txt in
    revision 4, and if user have copytrace disabled, we prints the following
    message:

    ```other changed <file> which local deleted```

    Returns a (branch_copies, branch_copies, diverge) tuple, where:

    the two "branch_copies" are instances of branch_copies, one per side of
    the merge.

    "diverge" is a mapping of source name -> list of destination names
    for divergent renames.

    This function calls different copytracing algorithms based on config.
    """
    # avoid silly behavior for update from empty dir
    if not c1 or not c2 or c1 == c2:
        return branch_copies(), branch_copies(), {}

    narrowmatch = c1.repo().narrowmatch()

    # avoid silly behavior for parent -> working dir
    if c2.node() is None and c1.node() == repo.dirstate.p1():
        return (
            branch_copies(_dirstatecopies(repo, narrowmatch)),
            branch_copies(),
            {},
        )

    copytracing = repo.ui.config(b'experimental', b'copytrace')
    if stringutil.parsebool(copytracing) is False:
        # stringutil.parsebool() returns None when it is unable to parse the
        # value, so we should rely on making sure copytracing is on such cases
        return branch_copies(), branch_copies(), {}

    if usechangesetcentricalgo(repo):
        # The heuristics don't make sense when we need changeset-centric algos
        return _fullcopytracing(repo, c1, c2, base)

    # Copy trace disabling is explicitly below the node == p1 logic above
    # because the logic above is required for a simple copy to be kept across a
    # rebase.
    if copytracing == b'heuristics':
        # Do full copytracing if only non-public revisions are involved as
        # that will be fast enough and will also cover the copies which could
        # be missed by heuristics
        if _isfullcopytraceable(repo, c1, base):
            return _fullcopytracing(repo, c1, c2, base)
        return _heuristicscopytracing(repo, c1, c2, base)
    else:
        return _fullcopytracing(repo, c1, c2, base)
749 749
750 750
751 751 def _isfullcopytraceable(repo, c1, base):
752 752 """Checks that if base, source and destination are all no-public branches,
753 753 if yes let's use the full copytrace algorithm for increased capabilities
754 754 since it will be fast enough.
755 755
756 756 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
757 757 number of changesets from c1 to base such that if number of changesets are
758 758 more than the limit, full copytracing algorithm won't be used.
759 759 """
760 760 if c1.rev() is None:
761 761 c1 = c1.p1()
762 762 if c1.mutable() and base.mutable():
763 763 sourcecommitlimit = repo.ui.configint(
764 764 b'experimental', b'copytrace.sourcecommitlimit'
765 765 )
766 766 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
767 767 return commits < sourcecommitlimit
768 768 return False
769 769
770 770
def _checksinglesidecopies(
    src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
):
    """Record the fate of a file copied/renamed on one side only.

    Updates ``copy`` (dst -> src) when the source was also modified on the
    other side, or ``renamedelete`` (src -> destinations) when the other
    side deleted the source after it was renamed.
    """
    if src not in m2:
        # deleted on side 2
        if src not in m1:
            # renamed on side 1, deleted on side 2
            renamedelete[src] = dsts1
        return
    if src not in mb:
        # Work around the "short-circuit to avoid issues with merge states"
        # thing in pathcopies(): pathcopies(x, y) can return a copy where the
        # destination doesn't exist in y.
        return
    if mb[src] != m2[src] and not _related(c2[src], base[src]):
        return
    if mb[src] != m2[src] or mb.flags(src) != m2.flags(src):
        # modified on side 2
        for dst in dsts1:
            copy[dst] = src
790 790
791 791
class branch_copies(object):
    """Information about copies made on one side of a merge/graft.

    "copy" maps destination name -> source name, where source is in c1 and
    destination is in c2 or vice-versa.

    "movewithdir" maps source name -> destination name for files present in
    one context but not the other that must be moved by the merge process
    because the other context moved their containing directory.

    "renamedelete" maps source name -> list of destination names for files
    deleted in c1 that were renamed in c2 or vice-versa.

    "dirmove" maps detected source dir -> destination dir renames. This is
    needed for handling changes to new files previously grafted into
    renamed directories.
    """

    def __init__(
        self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
    ):
        # every omitted mapping defaults to a fresh empty dict
        self.copy = copy if copy is not None else {}
        self.renamedelete = renamedelete if renamedelete is not None else {}
        self.dirmove = dirmove if dirmove is not None else {}
        self.movewithdir = movewithdir if movewithdir is not None else {}

    def __repr__(self):
        return '<branch_copies\n copy=%r\n renamedelete=%r\n dirmove=%r\n movewithdir=%r\n>' % (
            self.copy,
            self.renamedelete,
            self.dirmove,
            self.movewithdir,
        )
826 826
827 827
828 828 def _fullcopytracing(repo, c1, c2, base):
829 829 """The full copytracing algorithm which finds all the new files that were
830 830 added from merge base up to the top commit and for each file it checks if
831 831 this file was copied from another file.
832 832
833 833 This is pretty slow when a lot of changesets are involved but will track all
834 834 the copies.
835 835 """
836 836 m1 = c1.manifest()
837 837 m2 = c2.manifest()
838 838 mb = base.manifest()
839 839
840 840 copies1 = pathcopies(base, c1)
841 841 copies2 = pathcopies(base, c2)
842 842
843 843 if not (copies1 or copies2):
844 844 return branch_copies(), branch_copies(), {}
845 845
846 846 inversecopies1 = {}
847 847 inversecopies2 = {}
848 848 for dst, src in copies1.items():
849 849 inversecopies1.setdefault(src, []).append(dst)
850 850 for dst, src in copies2.items():
851 851 inversecopies2.setdefault(src, []).append(dst)
852 852
853 853 copy1 = {}
854 854 copy2 = {}
855 855 diverge = {}
856 856 renamedelete1 = {}
857 857 renamedelete2 = {}
858 858 allsources = set(inversecopies1) | set(inversecopies2)
859 859 for src in allsources:
860 860 dsts1 = inversecopies1.get(src)
861 861 dsts2 = inversecopies2.get(src)
862 862 if dsts1 and dsts2:
863 863 # copied/renamed on both sides
864 864 if src not in m1 and src not in m2:
865 865 # renamed on both sides
866 866 dsts1 = set(dsts1)
867 867 dsts2 = set(dsts2)
868 868 # If there's some overlap in the rename destinations, we
869 869 # consider it not divergent. For example, if side 1 copies 'a'
870 870 # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
871 871 # and 'd' and deletes 'a'.
872 872 if dsts1 & dsts2:
873 873 for dst in dsts1 & dsts2:
874 874 copy1[dst] = src
875 875 copy2[dst] = src
876 876 else:
877 877 diverge[src] = sorted(dsts1 | dsts2)
878 878 elif src in m1 and src in m2:
879 879 # copied on both sides
880 880 dsts1 = set(dsts1)
881 881 dsts2 = set(dsts2)
882 882 for dst in dsts1 & dsts2:
883 883 copy1[dst] = src
884 884 copy2[dst] = src
885 885 # TODO: Handle cases where it was renamed on one side and copied
886 886 # on the other side
887 887 elif dsts1:
888 888 # copied/renamed only on side 1
889 889 _checksinglesidecopies(
890 890 src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
891 891 )
892 892 elif dsts2:
893 893 # copied/renamed only on side 2
894 894 _checksinglesidecopies(
895 895 src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
896 896 )
897 897
898 898 # find interesting file sets from manifests
899 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
900 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
901 u1 = sorted(addedinm1 - addedinm2)
902 u2 = sorted(addedinm2 - addedinm1)
899 cache = []
900
901 def _get_addedfiles(idx):
902 if not cache:
903 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
904 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
905 u1 = sorted(addedinm1 - addedinm2)
906 u2 = sorted(addedinm2 - addedinm1)
907 cache.extend((u1, u2))
908 return cache[idx]
903 909
904 header = b" unmatched files in %s"
905 if u1:
906 repo.ui.debug(b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1)))
907 if u2:
908 repo.ui.debug(b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2)))
910 u1fn = lambda: _get_addedfiles(0)
911 u2fn = lambda: _get_addedfiles(1)
912 if repo.ui.debugflag:
913 u1 = u1fn()
914 u2 = u2fn()
909 915
910 if repo.ui.debugflag:
916 header = b" unmatched files in %s"
917 if u1:
918 repo.ui.debug(
919 b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1))
920 )
921 if u2:
922 repo.ui.debug(
923 b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2))
924 )
925
911 926 renamedeleteset = set()
912 927 divergeset = set()
913 928 for dsts in diverge.values():
914 929 divergeset.update(dsts)
915 930 for dsts in renamedelete1.values():
916 931 renamedeleteset.update(dsts)
917 932 for dsts in renamedelete2.values():
918 933 renamedeleteset.update(dsts)
919 934
920 935 repo.ui.debug(
921 936 b" all copies found (* = to merge, ! = divergent, "
922 937 b"% = renamed and deleted):\n"
923 938 )
924 939 for side, copies in ((b"local", copies1), (b"remote", copies2)):
925 940 if not copies:
926 941 continue
927 942 repo.ui.debug(b" on %s side:\n" % side)
928 943 for f in sorted(copies):
929 944 note = b""
930 945 if f in copy1 or f in copy2:
931 946 note += b"*"
932 947 if f in divergeset:
933 948 note += b"!"
934 949 if f in renamedeleteset:
935 950 note += b"%"
936 951 repo.ui.debug(
937 952 b" src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
938 953 )
939 954 del renamedeleteset
940 955 del divergeset
941 956
942 957 repo.ui.debug(b" checking for directory renames\n")
943 958
944 dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2)
945 dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1)
959 dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2fn)
960 dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1fn)
946 961
947 962 branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
948 963 branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)
949 964
950 965 return branch_copies1, branch_copies2, diverge
951 966
952 967
def _dir_renames(repo, ctx, copy, fullcopy, addedfilesfn):
    """Finds moved directories and files that should move with them.

    ctx: the context for one of the sides
    copy: files copied on the same side (as ctx)
    fullcopy: files copied on the same side (as ctx), including those that
    merge.manifestmerge() won't care about
    addedfilesfn: function returning added files on the other side (compared to
    ctx); called lazily, only when at least one directory move was detected

    Returns a pair (dirmove, movewithdir):
    - dirmove: map of moved source dir -> destination dir; both keys and
      values carry a trailing b'/'
    - movewithdir: map of file added on the other side -> its new path under
      the moved destination directory
    """
    # generate a directory move map
    invalid = set()
    dirmove = {}

    # examine each file copy for a potential directory move, which is
    # when all the files in a directory are moved to a new directory
    for dst, src in pycompat.iteritems(fullcopy):
        dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
        if dsrc in invalid:
            # already seen to be uninteresting
            continue
        elif ctx.hasdir(dsrc) and ctx.hasdir(ddst):
            # directory wasn't entirely moved locally
            invalid.add(dsrc)
        elif dsrc in dirmove and dirmove[dsrc] != ddst:
            # files from the same directory moved to two different places
            invalid.add(dsrc)
        else:
            # looks good so far
            dirmove[dsrc] = ddst

    for i in invalid:
        if i in dirmove:
            del dirmove[i]
    del invalid

    if not dirmove:
        # nothing detected, so the (possibly expensive) addedfilesfn() call
        # below is never made
        return {}, {}

    # normalize with trailing slashes so startswith() below only matches
    # whole path components
    dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}

    for d in dirmove:
        repo.ui.debug(
            b"   discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
        )

    movewithdir = {}
    # check unaccounted nonoverlapping files against directory moves
    for f in addedfilesfn():
        if f not in fullcopy:
            for d in dirmove:
                if f.startswith(d):
                    # new file added in a directory that was moved, move it
                    df = dirmove[d] + f[len(d) :]
                    if df not in copy:
                        movewithdir[f] = df
                        repo.ui.debug(
                            b"   pending file src: '%s' -> dst: '%s'\n"
                            % (f, df)
                        )
                        break

    return dirmove, movewithdir
1015 1031
1016 1032
def _heuristicscopytracing(repo, c1, c2, base):
    """Fast copytracing using filename heuristics

    Assumes that moves or renames are of following two types:

    1) Inside a directory only (same directory name but different filenames)
    2) Move from one directory to another
       (same filenames but different directory names)

    Works only when there are no merge commits in the "source branch".
    Source branch is commits from base up to c2 not including base.

    If merge is involved it fallbacks to _fullcopytracing().

    Can be used by setting the following config:

        [experimental]
        copytrace = heuristics

    In some cases the copy/move candidates found by heuristics can be very large
    in number and that will make the algorithm slow. The number of possible
    candidates to check can be limited by using the config
    `experimental.copytrace.movecandidateslimit` which defaults to 100.
    """

    # working-copy contexts have rev None; operate on their first parent
    if c1.rev() is None:
        c1 = c1.p1()
    if c2.rev() is None:
        c2 = c2.p1()

    changedfiles = set()
    m1 = c1.manifest()
    if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
        # If base is not in c2 branch, we switch to fullcopytracing
        repo.ui.debug(
            b"switching to full copytracing as base is not "
            b"an ancestor of c2\n"
        )
        return _fullcopytracing(repo, c1, c2, base)

    # accumulate files touched along base..c2 by walking first parents
    ctx = c2
    while ctx != base:
        if len(ctx.parents()) == 2:
            # To keep things simple let's not handle merges
            repo.ui.debug(b"switching to full copytracing because of merges\n")
            return _fullcopytracing(repo, c1, c2, base)
        changedfiles.update(ctx.files())
        ctx = ctx.p1()

    # copies from base to c2 whose source still exists in m1 need no heuristics
    copies2 = {}
    cp = _forwardcopies(base, c2)
    for dst, src in pycompat.iteritems(cp):
        if src in m1:
            copies2[dst] = src

    # file is missing if it isn't present in the destination, but is present in
    # the base and present in the source.
    # Presence in the base is important to exclude added files, presence in the
    # source is important to exclude removed files.
    filt = lambda f: f not in m1 and f in base and f in c2
    missingfiles = [f for f in changedfiles if filt(f)]

    copies1 = {}
    if missingfiles:
        # index files added on side 1 by basename and by dirname, so each
        # missing file only considers plausibly-related candidates
        basenametofilename = collections.defaultdict(list)
        dirnametofilename = collections.defaultdict(list)

        for f in m1.filesnotin(base.manifest()):
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            basenametofilename[basename].append(f)
            dirnametofilename[dirname].append(f)

        for f in missingfiles:
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            samebasename = basenametofilename[basename]
            samedirname = dirnametofilename[dirname]
            movecandidates = samebasename + samedirname
            # f is guaranteed to be present in c2, that's why
            # c2.filectx(f) won't fail
            f2 = c2.filectx(f)
            # we can have a lot of candidates which can slow down the heuristics
            # config value to limit the number of candidates moves to check
            maxcandidates = repo.ui.configint(
                b'experimental', b'copytrace.movecandidateslimit'
            )

            if len(movecandidates) > maxcandidates:
                repo.ui.status(
                    _(
                        b"skipping copytracing for '%s', more "
                        b"candidates than the limit: %d\n"
                    )
                    % (f, len(movecandidates))
                )
                continue

            for candidate in movecandidates:
                f1 = c1.filectx(candidate)
                if _related(f1, f2):
                    # if there are a few related copies then we'll merge
                    # changes into all of them. This matches the behaviour
                    # of upstream copytracing
                    copies1[candidate] = f

    return branch_copies(copies1), branch_copies(copies2), {}
1124 1140
1125 1141
1126 1142 def _related(f1, f2):
1127 1143 """return True if f1 and f2 filectx have a common ancestor
1128 1144
1129 1145 Walk back to common ancestor to see if the two files originate
1130 1146 from the same file. Since workingfilectx's rev() is None it messes
1131 1147 up the integer comparison logic, hence the pre-step check for
1132 1148 None (f1 and f2 can only be workingfilectx's initially).
1133 1149 """
1134 1150
1135 1151 if f1 == f2:
1136 1152 return True # a match
1137 1153
1138 1154 g1, g2 = f1.ancestors(), f2.ancestors()
1139 1155 try:
1140 1156 f1r, f2r = f1.linkrev(), f2.linkrev()
1141 1157
1142 1158 if f1r is None:
1143 1159 f1 = next(g1)
1144 1160 if f2r is None:
1145 1161 f2 = next(g2)
1146 1162
1147 1163 while True:
1148 1164 f1r, f2r = f1.linkrev(), f2.linkrev()
1149 1165 if f1r > f2r:
1150 1166 f1 = next(g1)
1151 1167 elif f2r > f1r:
1152 1168 f2 = next(g2)
1153 1169 else: # f1 and f2 point to files in the same linkrev
1154 1170 return f1 == f2 # true if they point to the same file
1155 1171 except StopIteration:
1156 1172 return False
1157 1173
1158 1174
def graftcopies(wctx, ctx, base):
    """reproduce copies between base and ctx in the wctx

    Unlike mergecopies(), this function will only consider copies between base
    and ctx; it will ignore copies between base and wctx. Also unlike
    mergecopies(), this function will apply copies to the working copy (instead
    of just returning information about the copies). That makes it cheaper
    (especially in the common case of base==ctx.p1()) and useful also when
    experimental.copytrace=off.

    merge.update() will have already marked most copies, but it will only
    mark copies if it thinks the source files are related (see
    merge._related()). It will also not mark copies if the file wasn't modified
    on the local side. This function adds the copies that were "missed"
    by merge.update().
    """
    # copies made between base and ctx only; base..wctx copies are ignored
    found = pathcopies(base, ctx)
    # drop entries that are invalid when chained onto the working copy's
    # first parent (see _filter() for the case table)
    _filter(wctx.p1(), wctx, found)
    # record each surviving copy directly in the working context
    for dest, source in pycompat.iteritems(found):
        wctx[dest].markcopied(source)
General Comments 0
You need to be logged in to leave comments. Login now