##// END OF EJS Templates
copies: avoid materializing a full directory map during copy tracing...
Kyle Lippincott -
r46632:b9588ff9 default
parent child Browse files
Show More
@@ -1,1149 +1,1148 b''
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import os
12 12
13 13 from .i18n import _
14 14
15 15
16 16 from . import (
17 17 match as matchmod,
18 18 node,
19 19 pathutil,
20 20 policy,
21 21 pycompat,
22 22 util,
23 23 )
24 24
25 25
26 26 from .utils import stringutil
27 27
28 28 from .revlogutils import flagutil
29 29
30 30 rustmod = policy.importrust("copy_tracing")
31 31
32 32
33 33 def _filter(src, dst, t):
34 34 """filters out invalid copies after chaining"""
35 35
36 36 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
37 37 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
38 38 # in the following table (not including trivial cases). For example, case 2
39 39 # is where a file existed in 'src' and remained under that name in 'mid' and
40 40 # then was renamed between 'mid' and 'dst'.
41 41 #
42 42 # case src mid dst result
43 43 # 1 x y - -
44 44 # 2 x y y x->y
45 45 # 3 x y x -
46 46 # 4 x y z x->z
47 47 # 5 - x y -
48 48 # 6 x x y x->y
49 49 #
50 50 # _chain() takes care of chaining the copies in 'a' and 'b', but it
51 51 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
52 52 # between 5 and 6, so it includes all cases in its result.
53 53 # Cases 1, 3, and 5 are then removed by _filter().
54 54
55 55 for k, v in list(t.items()):
56 56 # remove copies from files that didn't exist
57 57 if v not in src:
58 58 del t[k]
59 59 # remove criss-crossed copies
60 60 elif k in src and v in dst:
61 61 del t[k]
62 62 # remove copies to files that were then removed
63 63 elif k not in dst:
64 64 del t[k]
65 65
66 66
67 67 def _chain(prefix, suffix):
68 68 """chain two sets of copies 'prefix' and 'suffix'"""
69 69 result = prefix.copy()
70 70 for key, value in pycompat.iteritems(suffix):
71 71 result[key] = prefix.get(value, value)
72 72 return result
73 73
74 74
75 75 def _tracefile(fctx, am, basemf):
76 76 """return file context that is the ancestor of fctx present in ancestor
77 77 manifest am
78 78
79 79 Note: we used to try and stop after a given limit, however checking if that
80 80 limit is reached turned out to be very expensive. we are better off
81 81 disabling that feature."""
82 82
83 83 for f in fctx.ancestors():
84 84 path = f.path()
85 85 if am.get(path, None) == f.filenode():
86 86 return path
87 87 if basemf and basemf.get(path, None) == f.filenode():
88 88 return path
89 89
90 90
91 91 def _dirstatecopies(repo, match=None):
92 92 ds = repo.dirstate
93 93 c = ds.copies().copy()
94 94 for k in list(c):
95 95 if ds[k] not in b'anm' or (match and not match(k)):
96 96 del c[k]
97 97 return c
98 98
99 99
100 100 def _computeforwardmissing(a, b, match=None):
101 101 """Computes which files are in b but not a.
102 102 This is its own function so extensions can easily wrap this call to see what
103 103 files _forwardcopies is about to process.
104 104 """
105 105 ma = a.manifest()
106 106 mb = b.manifest()
107 107 return mb.filesnotin(ma, match=match)
108 108
109 109
def usechangesetcentricalgo(repo):
    """Checks if we should use changeset-centric copy algorithms"""
    # sidedata storage always implies changeset-centric copy data
    if repo.filecopiesmode == b'changeset-sidedata':
        return True
    readfrom = repo.ui.config(b'experimental', b'copies.read-from')
    return readfrom in (b'changeset-only', b'compatibility')
117 117
118 118
def _committedforwardcopies(a, b, base, match):
    """Like _forwardcopies(), but b.rev() cannot be None (working copy)

    Returns a {dst@b: src@a} mapping. `base` (a context, or None) allows
    copy sources to also be matched against base's manifest; `match`
    narrows the set of destination files considered.
    """
    # files might have to be traced back to the fctx parent of the last
    # one-side-only changeset, but not further back than that
    repo = a._repo

    # changeset-centric algorithms read copy data from the changesets
    # themselves instead of walking filelogs
    if usechangesetcentricalgo(repo):
        return _changesetforwardcopies(a, b, match)

    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
    dbg = repo.ui.debug
    if debug:
        dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
    am = a.manifest()
    basemf = None if base is None else base.manifest()

    # find where new files came from
    # we currently don't try to find where old files went, too expensive
    # this means we can miss a case like 'hg rm b; hg cp a b'
    cm = {}

    # Computing the forward missing is quite expensive on large manifests, since
    # it compares the entire manifests. We can optimize it in the common use
    # case of computing what copies are in a commit versus its parent (like
    # during a rebase or histedit). Note, we exclude merge commits from this
    # optimization, since the ctx.files() for a merge commit is not correct for
    # this comparison.
    forwardmissingmatch = match
    if b.p1() == a and b.p2().node() == node.nullid:
        filesmatcher = matchmod.exact(b.files())
        forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
    missing = _computeforwardmissing(a, b, match=forwardmissingmatch)

    # shared ancestry context speeds up the repeated fctx.ancestors() walks
    ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)

    if debug:
        dbg(b'debug.copies: missing files to search: %d\n' % len(missing))

    for f in sorted(missing):
        if debug:
            dbg(b'debug.copies: tracing file: %s\n' % f)
        fctx = b[f]
        fctx._ancestrycontext = ancestrycontext

        if debug:
            start = util.timer()
        opath = _tracefile(fctx, am, basemf)
        if opath:
            if debug:
                dbg(b'debug.copies: rename of: %s\n' % opath)
            cm[f] = opath
        if debug:
            dbg(
                b'debug.copies: time: %f seconds\n'
                % (util.timer() - start)
            )
    return cm
176 176
177 177
def _revinfo_getter(repo):
    """returns a function that returns the following data given a <rev>

    * p1: revision number of first parent
    * p2: revision number of second parent
    * changes: a ChangingFiles object (or None when the revision carries no
      copy information)
    """
    cl = repo.changelog
    parents = cl.parentrevs
    flags = cl.flags

    HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO

    changelogrevision = cl.changelogrevision

    # A small cache to avoid doing the work twice for merges
    #
    # In the vast majority of cases, if we ask information for a revision
    # about 1 parent, we'll later ask it for the other. So it make sense to
    # keep the information around when reaching the first parent of a merge
    # and dropping it after it was provided for the second parents.
    #
    # It exists cases were only one parent of the merge will be walked. It
    # happens when the "destination" the copy tracing is descendant from a
    # new root, not common with the "source". In that case, we will only walk
    # through merge parents that are descendant of changesets common
    # between "source" and "destination".
    #
    # With the current case implementation if such changesets have a copy
    # information, we'll keep them in memory until the end of
    # _changesetforwardcopies. We don't expect the case to be frequent
    # enough to matters.
    #
    # In addition, it would be possible to reach pathological case, were
    # many first parent are met before any second parent is reached. In
    # that case the cache could grow. If this even become an issue one can
    # safely introduce a maximum cache size. This would trade extra CPU/IO
    # time to save memory.
    merge_caches = {}

    def revinfo(rev):
        p1, p2 = parents(rev)
        value = None
        # pop: the cached entry is only needed once more (second parent)
        e = merge_caches.pop(rev, None)
        if e is not None:
            return e
        changes = None
        # only revisions flagged as carrying copy info have sidedata changes
        if flags(rev) & HASCOPIESINFO:
            changes = changelogrevision(rev).changes
        value = (p1, p2, changes)
        if p1 != node.nullrev and p2 != node.nullrev:
            # XXX some case we over cache, IGNORE
            merge_caches[rev] = value
        return value

    return revinfo
234 234
235 235
def cached_is_ancestor(is_ancestor):
    """return a cached version of is_ancestor"""
    cache = {}

    def _is_ancestor(anc, desc):
        # revision numbers only grow along ancestry, so these two cases
        # can be answered without consulting the expensive callback
        if anc > desc:
            return False
        if anc == desc:
            return True
        key = (anc, desc)
        try:
            return cache[key]
        except KeyError:
            result = cache[key] = is_ancestor(anc, desc)
            return result

    return _is_ancestor
252 252
253 253
def _changesetforwardcopies(a, b, match):
    """find {dst@b: src@a} copies using changeset-stored copy information

    Walks the revisions that are in b but not in a, accumulating the
    per-changeset copy records, and combines them into a single mapping.
    """
    if a.rev() in (node.nullrev, b.rev()):
        return {}

    repo = a.repo().unfiltered()
    children = {}

    cl = repo.changelog
    isancestor = cl.isancestorrev
    # revisions reachable from b but not from a: the span to walk
    missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
    mrset = set(missingrevs)
    roots = set()
    for r in missingrevs:
        for p in cl.parentrevs(r):
            if p == node.nullrev:
                continue
            if p not in children:
                children[p] = [r]
            else:
                children[p].append(r)
            # a parent outside the missing set is a starting point
            if p not in mrset:
                roots.add(p)
    if not roots:
        # no common revision to track copies from
        return {}
    min_root = min(roots)

    # restrict the walk to revisions actually on a path from a root to b
    from_head = set(
        cl.reachableroots(min_root, [b.rev()], list(roots), includepath=True)
    )

    iterrevs = set(from_head)
    iterrevs &= mrset
    iterrevs.update(roots)
    iterrevs.remove(b.rev())
    revs = sorted(iterrevs)

    # dispatch on where the changeset copy data is stored
    if repo.filecopiesmode == b'changeset-sidedata':
        revinfo = _revinfo_getter(repo)
        return _combine_changeset_copies(
            revs, children, b.rev(), revinfo, match, isancestor
        )
    else:
        revinfo = _revinfo_getter_extra(repo)
        return _combine_changeset_copies_extra(
            revs, children, b.rev(), revinfo, match, isancestor
        )
301 301
302 302
def _combine_changeset_copies(
    revs, children, targetrev, revinfo, match, isancestor
):
    """combine the copies information for each item of iterrevs

    revs: sorted iterable of revision to visit
    children: a {parent: [children]} mapping.
    targetrev: the final copies destination revision (not in iterrevs)
    revinfo(rev): a function that returns a (p1, p2, changes) tuple
    match: a matcher

    It returns the aggregated copies information for `targetrev`.

    Internally, `all_copies` maps each visited revision to a
    {dst: (rev, src-or-None)} dict; a None source marks a deletion that
    overrides any earlier copy of that destination.
    """

    alwaysmatch = match.always()

    # the rust implementation only supports the unfiltered (always-match) case
    if rustmod is not None and alwaysmatch:
        return rustmod.combine_changeset_copies(
            list(revs), children, targetrev, revinfo, isancestor
        )

    isancestor = cached_is_ancestor(isancestor)

    all_copies = {}
    for r in revs:
        # pop: once visited, a revision's copies are only consumed by its
        # children below
        copies = all_copies.pop(r, None)
        if copies is None:
            # this is a root
            copies = {}
        for i, c in enumerate(children[r]):
            p1, p2, changes = revinfo(c)
            childcopies = {}
            if r == p1:
                parent = 1
                if changes is not None:
                    childcopies = changes.copied_from_p1
            else:
                assert r == p2
                parent = 2
                if changes is not None:
                    childcopies = changes.copied_from_p2
            if not alwaysmatch:
                childcopies = {
                    dst: src for dst, src in childcopies.items() if match(dst)
                }
            newcopies = copies
            if childcopies:
                newcopies = copies.copy()
                for dest, source in pycompat.iteritems(childcopies):
                    # chain through earlier copies of the same source
                    prev = copies.get(source)
                    if prev is not None and prev[1] is not None:
                        source = prev[1]
                    newcopies[dest] = (c, source)
                assert newcopies is not copies
            if changes is not None and changes.removed:
                if newcopies is copies:
                    newcopies = copies.copy()
                for f in changes.removed:
                    if f in newcopies:
                        # NOTE(review): this inner guard looks redundant with
                        # the copy made just above — confirm before removing
                        if newcopies is copies:
                            # copy on write to avoid affecting potential other
                            # branches. when there are no other branches, this
                            # could be avoided.
                            newcopies = copies.copy()
                        newcopies[f] = (c, None)
            othercopies = all_copies.get(c)
            if othercopies is None:
                all_copies[c] = newcopies
            elif newcopies is othercopies:
                # nothing to merge:
                pass
            else:
                # we are the second parent to work on c, we need to merge our
                # work with the other.
                #
                # In case of conflict, parent 1 take precedence over parent 2.
                # This is an arbitrary choice made anew when implementing
                # changeset based copies. It was made without regards with
                # potential filelog related behavior.
                if parent == 1:
                    minor, major = othercopies, newcopies
                else:
                    minor, major = newcopies, othercopies
                copies = _merge_copies_dict(minor, major, isancestor, changes)
                all_copies[c] = copies

    # drop deletion markers; only surviving copies are reported
    final_copies = {}
    for dest, (tt, source) in all_copies[targetrev].items():
        if source is not None:
            final_copies[dest] = source
    return final_copies
394 394
395 395
def _merge_copies_dict(minor, major, isancestor, changes):
    """merge two copies-mapping together, minor and major

    In case of conflict, value from "major" will be picked.

    - `isancestor(low_rev, high_rev)`: callable return True if `low_rev` is an
      ancestors of `high_rev`,

    - `changes`: the ChangingFiles data for the merge revision being
      processed (or None); its `salvaged` and `merged` file sets refine
      conflict resolution below.

    return the resulting dict (in practice, the "minor" object, updated)

    Entries have the shape {dest: (rev, source-or-None)}; a None source
    records a deletion of `dest`.
    """
    for dest, value in major.items():
        other = minor.get(dest)
        if other is None:
            minor[dest] = value
        else:
            new_tt = value[0]
            other_tt = other[0]
            if value[1] == other[1]:
                continue
            # content from "major" wins, unless it is older
            # than the branch point or there is a merge
            if new_tt == other_tt:
                minor[dest] = value
            elif (
                changes is not None
                and value[1] is None
                and dest in changes.salvaged
            ):
                # major records a deletion, but the file was salvaged in
                # this revision: keep the minor entry
                pass
            elif (
                changes is not None
                and other[1] is None
                and dest in changes.salvaged
            ):
                # minor records a deletion, but the file was salvaged:
                # take the major entry
                minor[dest] = value
            elif changes is not None and dest in changes.merged:
                # the file was merged here, so major's information wins
                minor[dest] = value
            elif not isancestor(new_tt, other_tt):
                if value[1] is not None:
                    minor[dest] = value
            elif isancestor(other_tt, new_tt):
                minor[dest] = value
    return minor
442 442
443 443
def _revinfo_getter_extra(repo):
    """return a function that return multiple data given a <rev>

    * p1: revision number of first parent
    * p2: revision number of second parent
    * p1copies: mapping of copies from p1
    * p2copies: mapping of copies from p2
    * removed: a list of removed files
    * ismerged: a callback to know if file was merged in that revision
    """
    cl = repo.changelog
    parents = cl.parentrevs

    def get_ismerged(rev):
        # build the per-revision "was this path merged" predicate
        ctx = repo[rev]

        def ismerged(path):
            if path not in ctx.files():
                return False
            fctx = ctx[path]
            # a file is considered merged when its filelog entry has two
            # non-null parents
            parents = fctx._filelog.parents(fctx._filenode)
            nb_parents = 0
            for n in parents:
                if n != node.nullid:
                    nb_parents += 1
            return nb_parents >= 2

        return ismerged

    def revinfo(rev):
        p1, p2 = parents(rev)
        ctx = repo[rev]
        p1copies, p2copies = ctx._copies
        removed = ctx.filesremoved()
        return p1, p2, p1copies, p2copies, removed, get_ismerged(rev)

    return revinfo
481 481
482 482
def _combine_changeset_copies_extra(
    revs, children, targetrev, revinfo, match, isancestor
):
    """version of `_combine_changeset_copies` that works with the Google
    specific "extra" based storage for copy information

    Entries in `all_copies` have the shape {dst: (rev, src-or-None)}; a
    None source marks a deletion overriding the copy.
    """
    all_copies = {}
    alwaysmatch = match.always()
    for r in revs:
        # pop: once visited, a revision's copies are only consumed by its
        # children below
        copies = all_copies.pop(r, None)
        if copies is None:
            # this is a root
            copies = {}
        for i, c in enumerate(children[r]):
            p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c)
            if r == p1:
                parent = 1
                childcopies = p1copies
            else:
                assert r == p2
                parent = 2
                childcopies = p2copies
            if not alwaysmatch:
                childcopies = {
                    dst: src for dst, src in childcopies.items() if match(dst)
                }
            newcopies = copies
            if childcopies:
                newcopies = copies.copy()
                for dest, source in pycompat.iteritems(childcopies):
                    # chain through earlier copies of the same source
                    prev = copies.get(source)
                    if prev is not None and prev[1] is not None:
                        source = prev[1]
                    newcopies[dest] = (c, source)
                assert newcopies is not copies
            for f in removed:
                if f in newcopies:
                    if newcopies is copies:
                        # copy on write to avoid affecting potential other
                        # branches. when there are no other branches, this
                        # could be avoided.
                        newcopies = copies.copy()
                    newcopies[f] = (c, None)
            othercopies = all_copies.get(c)
            if othercopies is None:
                all_copies[c] = newcopies
            else:
                # we are the second parent to work on c, we need to merge our
                # work with the other.
                #
                # In case of conflict, parent 1 take precedence over parent 2.
                # This is an arbitrary choice made anew when implementing
                # changeset based copies. It was made without regards with
                # potential filelog related behavior.
                if parent == 1:
                    _merge_copies_dict_extra(
                        othercopies, newcopies, isancestor, ismerged
                    )
                else:
                    _merge_copies_dict_extra(
                        newcopies, othercopies, isancestor, ismerged
                    )
                all_copies[c] = newcopies

    # drop deletion markers; only surviving copies are reported
    final_copies = {}
    for dest, (tt, source) in all_copies[targetrev].items():
        if source is not None:
            final_copies[dest] = source
    return final_copies
551 551
552 552
553 553 def _merge_copies_dict_extra(minor, major, isancestor, ismerged):
554 554 """version of `_merge_copies_dict` that works with the Google
555 555 specific "extra" based storage for copy information"""
556 556 for dest, value in major.items():
557 557 other = minor.get(dest)
558 558 if other is None:
559 559 minor[dest] = value
560 560 else:
561 561 new_tt = value[0]
562 562 other_tt = other[0]
563 563 if value[1] == other[1]:
564 564 continue
565 565 # content from "major" wins, unless it is older
566 566 # than the branch point or there is a merge
567 567 if (
568 568 new_tt == other_tt
569 569 or not isancestor(new_tt, other_tt)
570 570 or ismerged(dest)
571 571 ):
572 572 minor[dest] = value
573 573
574 574
def _forwardcopies(a, b, base=None, match=None):
    """find {dst@b: src@a} copy mapping where a is an ancestor of b"""

    if base is None:
        base = a
    match = a.repo().narrowmatch(match)
    # the working copy carries no committed copy data of its own: trace up
    # to its first parent, then layer the dirstate-recorded copies on top
    if b.rev() is None:
        committed = _committedforwardcopies(a, b.p1(), base, match)
        return _chain(committed, _dirstatecopies(b._repo, match))
    return _committedforwardcopies(a, b, base, match)
589 589
590 590
def _backwardrenames(a, b, match):
    """find renames from descendant b back to its ancestor a"""
    if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
        return {}

    # Even though we're not taking copies into account, 1:n rename situations
    # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
    # arbitrarily pick one of the renames.
    # We don't want to pass in "match" here, since that would filter
    # the destination by it. Since we're reversing the copies, we want
    # to filter the source instead.
    forward = _forwardcopies(b, a)
    backward = {}
    for fdst, fsrc in sorted(pycompat.iteritems(forward)):
        if match and not match(fsrc):
            continue
        # remove copies (source still exists in a, so this was not a rename)
        if fsrc in a:
            continue
        backward[fsrc] = fdst
    return backward
611 611
612 612
def pathcopies(x, y, match=None):
    """find {dst@y: src@x} copy mapping for directed compare

    Dispatches to forward, backward or combined (backward then forward)
    tracing depending on how x and y relate topologically.
    """
    repo = x._repo
    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
    if debug:
        repo.ui.debug(
            b'debug.copies: searching copies from %s to %s\n' % (x, y)
        )
    if x == y or not x or not y:
        return {}
    if y.rev() is None and x == y.p1():
        if debug:
            repo.ui.debug(b'debug.copies: search mode: dirstate\n')
        # short-circuit to avoid issues with merge states
        return _dirstatecopies(repo, match)
    a = y.ancestor(x)
    if a == x:
        if debug:
            repo.ui.debug(b'debug.copies: search mode: forward\n')
        copies = _forwardcopies(x, y, match=match)
    elif a == y:
        if debug:
            repo.ui.debug(b'debug.copies: search mode: backward\n')
        copies = _backwardrenames(x, y, match=match)
    else:
        if debug:
            repo.ui.debug(b'debug.copies: search mode: combined\n')
        base = None
        if a.rev() != node.nullrev:
            # pass x as extra manifest to match copy sources against when
            # tracing forward from the common ancestor
            base = x
        copies = _chain(
            _backwardrenames(x, a, match=match),
            _forwardcopies(a, y, base, match=match),
        )
    # drop chained copies that are invalid with respect to x and y
    _filter(x, y, copies)
    return copies
649 649
650 650
def mergecopies(repo, c1, c2, base):
    """
    Finds moves and copies between context c1 and c2 that are relevant for
    merging. 'base' will be used as the merge base.

    Copytracing is used in commands like rebase, merge, unshelve, etc to merge
    files that were moved/ copied in one merge parent and modified in another.
    For example:

    o          ---> 4 another commit
    |
    |   o      ---> 3 commit that modifies a.txt
    |  /
    o /        ---> 2 commit that moves a.txt to b.txt
    |/
    o          ---> 1 merge base

    If we try to rebase revision 3 on revision 4, since there is no a.txt in
    revision 4, and if the user has copytrace disabled, we print the following
    message:

    ```other changed <file> which local deleted```

    Returns a tuple where:

    "branch_copies" an instance of branch_copies.

    "diverge" is a mapping of source name -> list of destination names
    for divergent renames.

    This function calls different copytracing algorithms based on config.
    """
    # avoid silly behavior for update from empty dir
    if not c1 or not c2 or c1 == c2:
        return branch_copies(), branch_copies(), {}

    narrowmatch = c1.repo().narrowmatch()

    # avoid silly behavior for parent -> working dir
    if c2.node() is None and c1.node() == repo.dirstate.p1():
        return (
            branch_copies(_dirstatecopies(repo, narrowmatch)),
            branch_copies(),
            {},
        )

    copytracing = repo.ui.config(b'experimental', b'copytrace')
    if stringutil.parsebool(copytracing) is False:
        # stringutil.parsebool() returns None when it is unable to parse the
        # value, so we should rely on making sure copytracing is on such cases
        return branch_copies(), branch_copies(), {}

    if usechangesetcentricalgo(repo):
        # The heuristics don't make sense when we need changeset-centric algos
        return _fullcopytracing(repo, c1, c2, base)

    # Copy trace disabling is explicitly below the node == p1 logic above
    # because the logic above is required for a simple copy to be kept across a
    # rebase.
    if copytracing == b'heuristics':
        # Do full copytracing if only non-public revisions are involved as
        # that will be fast enough and will also cover the copies which could
        # be missed by heuristics
        if _isfullcopytraceable(repo, c1, base):
            return _fullcopytracing(repo, c1, c2, base)
        return _heuristicscopytracing(repo, c1, c2, base)
    else:
        return _fullcopytracing(repo, c1, c2, base)
719 719
720 720
721 721 def _isfullcopytraceable(repo, c1, base):
722 722 """Checks that if base, source and destination are all no-public branches,
723 723 if yes let's use the full copytrace algorithm for increased capabilities
724 724 since it will be fast enough.
725 725
726 726 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
727 727 number of changesets from c1 to base such that if number of changesets are
728 728 more than the limit, full copytracing algorithm won't be used.
729 729 """
730 730 if c1.rev() is None:
731 731 c1 = c1.p1()
732 732 if c1.mutable() and base.mutable():
733 733 sourcecommitlimit = repo.ui.configint(
734 734 b'experimental', b'copytrace.sourcecommitlimit'
735 735 )
736 736 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
737 737 return commits < sourcecommitlimit
738 738 return False
739 739
740 740
def _checksinglesidecopies(
    src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
):
    """handle copies/renames of `src` -> `dsts1` seen on one side only

    m1/m2/mb are the manifests of this side, the other side and the base;
    c2 is the other side's context. Updates `copy` ({dst: src}) when the
    source was also modified on the other side, and `renamedelete`
    ({src: dsts}) when the file was renamed here but deleted there.
    """
    if src not in m2:
        # deleted on side 2
        if src not in m1:
            # renamed on side 1, deleted on side 2
            renamedelete[src] = dsts1
    elif src not in mb:
        # Work around the "short-circuit to avoid issues with merge states"
        # thing in pathcopies(): pathcopies(x, y) can return a copy where the
        # destination doesn't exist in y.
        pass
    elif mb[src] != m2[src] and not _related(c2[src], base[src]):
        # changed on side 2 but unrelated to the base version: no copy
        return
    elif mb[src] != m2[src] or mb.flags(src) != m2.flags(src):
        # modified on side 2
        for dst in dsts1:
            copy[dst] = src
760 760
761 761
class branch_copies(object):
    """Information about copies made on one side of a merge/graft.

    Attributes:

    - copy: {destination: source} mapping, where source is in c1 and
      destination is in c2 or vice-versa.

    - movewithdir: {source: destination} mapping for files that exist in
      one context but not the other and must be moved because the other
      context moved the directory they live in.

    - renamedelete: {source: [destinations]} for files deleted in c1 that
      were renamed in c2 or vice-versa.

    - dirmove: {source dir: destination dir} of detected directory renames;
      needed for handling changes to new files previously grafted into
      renamed directories.
    """

    def __init__(
        self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
    ):
        self.copy = copy if copy is not None else {}
        self.renamedelete = renamedelete if renamedelete is not None else {}
        self.dirmove = dirmove if dirmove is not None else {}
        self.movewithdir = movewithdir if movewithdir is not None else {}

    def __repr__(self):
        template = (
            '<branch_copies\n copy=%r\n renamedelete=%r\n'
            ' dirmove=%r\n movewithdir=%r\n>'
        )
        return template % (
            self.copy,
            self.renamedelete,
            self.dirmove,
            self.movewithdir,
        )
796 796
797 797
def _fullcopytracing(repo, c1, c2, base):
    """The full copytracing algorithm which finds all the new files that were
    added from merge base up to the top commit and for each file it checks if
    this file was copied from another file.

    This is pretty slow when a lot of changesets are involved but will track all
    the copies.

    Returns (branch_copies for c1, branch_copies for c2, diverge) — see
    mergecopies() for the meaning of each element.
    """
    m1 = c1.manifest()
    m2 = c2.manifest()
    mb = base.manifest()

    # copies from the merge base to each side
    copies1 = pathcopies(base, c1)
    copies2 = pathcopies(base, c2)

    if not (copies1 or copies2):
        return branch_copies(), branch_copies(), {}

    # invert to {source: [destinations]} so each copy source can be
    # examined across both sides at once
    inversecopies1 = {}
    inversecopies2 = {}
    for dst, src in copies1.items():
        inversecopies1.setdefault(src, []).append(dst)
    for dst, src in copies2.items():
        inversecopies2.setdefault(src, []).append(dst)

    copy1 = {}
    copy2 = {}
    diverge = {}
    renamedelete1 = {}
    renamedelete2 = {}
    allsources = set(inversecopies1) | set(inversecopies2)
    for src in allsources:
        dsts1 = inversecopies1.get(src)
        dsts2 = inversecopies2.get(src)
        if dsts1 and dsts2:
            # copied/renamed on both sides
            if src not in m1 and src not in m2:
                # renamed on both sides
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                # If there's some overlap in the rename destinations, we
                # consider it not divergent. For example, if side 1 copies 'a'
                # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
                # and 'd' and deletes 'a'.
                if dsts1 & dsts2:
                    for dst in dsts1 & dsts2:
                        copy1[dst] = src
                        copy2[dst] = src
                else:
                    diverge[src] = sorted(dsts1 | dsts2)
            elif src in m1 and src in m2:
                # copied on both sides
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                for dst in dsts1 & dsts2:
                    copy1[dst] = src
                    copy2[dst] = src
            # TODO: Handle cases where it was renamed on one side and copied
            # on the other side
        elif dsts1:
            # copied/renamed only on side 1
            _checksinglesidecopies(
                src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
            )
        elif dsts2:
            # copied/renamed only on side 2
            _checksinglesidecopies(
                src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
            )

    # find interesting file sets from manifests
    addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
    addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
    u1 = sorted(addedinm1 - addedinm2)
    u2 = sorted(addedinm2 - addedinm1)

    header = b" unmatched files in %s"
    if u1:
        repo.ui.debug(b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1)))
    if u2:
        repo.ui.debug(b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2)))

    if repo.ui.debugflag:
        # debug output only: annotate every traced copy with its disposition
        renamedeleteset = set()
        divergeset = set()
        for dsts in diverge.values():
            divergeset.update(dsts)
        for dsts in renamedelete1.values():
            renamedeleteset.update(dsts)
        for dsts in renamedelete2.values():
            renamedeleteset.update(dsts)

        repo.ui.debug(
            b" all copies found (* = to merge, ! = divergent, "
            b"% = renamed and deleted):\n"
        )
        for side, copies in ((b"local", copies1), (b"remote", copies2)):
            if not copies:
                continue
            repo.ui.debug(b" on %s side:\n" % side)
            for f in sorted(copies):
                note = b""
                if f in copy1 or f in copy2:
                    note += b"*"
                if f in divergeset:
                    note += b"!"
                if f in renamedeleteset:
                    note += b"%"
                repo.ui.debug(
                    b" src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
                )
        del renamedeleteset
        del divergeset

    repo.ui.debug(b" checking for directory renames\n")

    dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2)
    dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1)

    branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
    branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)

    return branch_copies1, branch_copies2, diverge
921 921
922 922
def _dir_renames(repo, ctx, copy, fullcopy, addedfiles):
    """Detect wholesale directory moves, and files that should follow them.

    ctx: the context for one of the sides
    copy: files copied on the same side (as ctx)
    fullcopy: files copied on the same side (as ctx), including those that
        merge.manifestmerge() won't care about
    addedfiles: added files on the other side (compared to ctx)

    Returns a pair (dirmove, movewithdir): a map of moved source directory
    to destination directory (keys/values carry a trailing b'/'), and a map
    of added files to the destination they should be moved to.
    """
    # Build the candidate directory-move map: a directory counts as moved
    # only if every copy out of it targets the same destination directory
    # and the source directory no longer exists in ctx.
    ruled_out = set()
    dir_map = {}
    for dst, src in pycompat.iteritems(fullcopy):
        src_dir, dst_dir = pathutil.dirname(src), pathutil.dirname(dst)
        if src_dir in ruled_out:
            # already known to be uninteresting
            continue
        if ctx.hasdir(src_dir) and ctx.hasdir(dst_dir):
            # source directory still exists: it wasn't entirely moved
            ruled_out.add(src_dir)
        elif src_dir in dir_map and dir_map[src_dir] != dst_dir:
            # files from one directory went to two different places
            ruled_out.add(src_dir)
        else:
            # looks like a clean directory move so far
            dir_map[src_dir] = dst_dir

    for bad in ruled_out:
        dir_map.pop(bad, None)

    if not dir_map:
        return {}, {}

    # Normalize keys/values to carry a trailing slash for prefix matching.
    dir_map = {
        src_dir + b"/": dst_dir + b"/"
        for src_dir, dst_dir in pycompat.iteritems(dir_map)
    }

    for src_dir in dir_map:
        repo.ui.debug(
            b"   discovered dir src: '%s' -> dst: '%s'\n"
            % (src_dir, dir_map[src_dir])
        )

    # Files added on the other side, inside a moved directory and not
    # otherwise accounted for as copies, should move along with it.
    movewithdir = {}
    for f in addedfiles:
        if f in fullcopy:
            continue
        for src_dir in dir_map:
            if not f.startswith(src_dir):
                continue
            # new file added in a directory that was moved, move it
            df = dir_map[src_dir] + f[len(src_dir) :]
            if df not in copy:
                movewithdir[f] = df
                repo.ui.debug(
                    b"   pending file src: '%s' -> dst: '%s'\n" % (f, df)
                )
            break

    return dir_map, movewithdir
986 985
987 986
def _heuristicscopytracing(repo, c1, c2, base):
    """Fast copytracing using filename heuristics

    Assumes that moves or renames fall into one of two patterns:

    1) Inside a directory only (same directory name but different filenames)
    2) Move from one directory to another
       (same filenames but different directory names)

    Works only when there are no merge commits in the "source branch".
    Source branch is commits from base up to c2 not including base.

    If a merge is involved it falls back to _fullcopytracing().

    Can be used by setting the following config:

        [experimental]
        copytrace = heuristics

    In some cases the copy/move candidates found by heuristics can be very
    numerous, which would make the algorithm slow; the number of candidates
    checked per file is capped by the config
    `experimental.copytrace.movecandidateslimit` (default 100).
    """
    # Working-copy contexts have no revision number; use their parents.
    if c1.rev() is None:
        c1 = c1.p1()
    if c2.rev() is None:
        c2 = c2.p1()

    m1 = c1.manifest()
    if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
        # base is not an ancestor of c2: the heuristic walk below would be
        # wrong, so use the exhaustive algorithm instead.
        repo.ui.debug(
            b"switching to full copytracing as base is not "
            b"an ancestor of c2\n"
        )
        return _fullcopytracing(repo, c1, c2, base)

    # Gather every file touched between base and c2, bailing out to the
    # full algorithm as soon as a merge commit is encountered.
    changedfiles = set()
    ctx = c2
    while ctx != base:
        if len(ctx.parents()) == 2:
            # To keep things simple let's not handle merges
            repo.ui.debug(b"switching to full copytracing because of merges\n")
            return _fullcopytracing(repo, c1, c2, base)
        changedfiles.update(ctx.files())
        ctx = ctx.p1()

    # Copies made between base and c2 whose source still exists on side 1.
    copies2 = {
        dst: src
        for dst, src in pycompat.iteritems(_forwardcopies(base, c2))
        if src in m1
    }

    # A file is "missing" if it isn't present in the destination, but is
    # present in the base and present in the source.
    # Presence in the base is important to exclude added files, presence in
    # the source is important to exclude removed files.
    missingfiles = [
        f for f in changedfiles if f not in m1 and f in base and f in c2
    ]

    copies1 = {}
    if missingfiles:
        # Index side-1's new files by basename and by directory so both
        # heuristics can look up candidates cheaply.
        basenametofilename = collections.defaultdict(list)
        dirnametofilename = collections.defaultdict(list)
        for new_file in m1.filesnotin(base.manifest()):
            basenametofilename[os.path.basename(new_file)].append(new_file)
            dirnametofilename[os.path.dirname(new_file)].append(new_file)

        for f in missingfiles:
            movecandidates = (
                basenametofilename[os.path.basename(f)]
                + dirnametofilename[os.path.dirname(f)]
            )
            # f is guaranteed to be present in c2, that's why
            # c2.filectx(f) won't fail
            f2 = c2.filectx(f)
            # we can have a lot of candidates which can slow down the
            # heuristics; honor the configured cap on moves to check
            maxcandidates = repo.ui.configint(
                b'experimental', b'copytrace.movecandidateslimit'
            )
            if len(movecandidates) > maxcandidates:
                repo.ui.status(
                    _(
                        b"skipping copytracing for '%s', more "
                        b"candidates than the limit: %d\n"
                    )
                    % (f, len(movecandidates))
                )
                continue

            for candidate in movecandidates:
                f1 = c1.filectx(candidate)
                if _related(f1, f2):
                    # if there are a few related copies then we'll merge
                    # changes into all of them. This matches the behaviour
                    # of upstream copytracing
                    copies1[candidate] = f

    return branch_copies(copies1), branch_copies(copies2), {}
1095 1094
1096 1095
1097 1096 def _related(f1, f2):
1098 1097 """return True if f1 and f2 filectx have a common ancestor
1099 1098
1100 1099 Walk back to common ancestor to see if the two files originate
1101 1100 from the same file. Since workingfilectx's rev() is None it messes
1102 1101 up the integer comparison logic, hence the pre-step check for
1103 1102 None (f1 and f2 can only be workingfilectx's initially).
1104 1103 """
1105 1104
1106 1105 if f1 == f2:
1107 1106 return True # a match
1108 1107
1109 1108 g1, g2 = f1.ancestors(), f2.ancestors()
1110 1109 try:
1111 1110 f1r, f2r = f1.linkrev(), f2.linkrev()
1112 1111
1113 1112 if f1r is None:
1114 1113 f1 = next(g1)
1115 1114 if f2r is None:
1116 1115 f2 = next(g2)
1117 1116
1118 1117 while True:
1119 1118 f1r, f2r = f1.linkrev(), f2.linkrev()
1120 1119 if f1r > f2r:
1121 1120 f1 = next(g1)
1122 1121 elif f2r > f1r:
1123 1122 f2 = next(g2)
1124 1123 else: # f1 and f2 point to files in the same linkrev
1125 1124 return f1 == f2 # true if they point to the same file
1126 1125 except StopIteration:
1127 1126 return False
1128 1127
1129 1128
def graftcopies(wctx, ctx, base):
    """reproduce copies between base and ctx in the wctx

    Unlike mergecopies(), this function will only consider copies between
    base and ctx; it will ignore copies between base and wctx. Also unlike
    mergecopies(), this function will apply copies to the working copy
    (instead of just returning information about the copies). That makes it
    cheaper (especially in the common case of base==ctx.p1()) and useful
    also when experimental.copytrace=off.

    merge.update() will have already marked most copies, but it will only
    mark copies if it thinks the source files are related (see
    merge._related()). It will also not mark copies if the file wasn't
    modified on the local side. This function adds the copies that were
    "missed" by merge.update().
    """
    found = pathcopies(base, ctx)
    # drop copies that are invalid when chained into the working copy
    _filter(wctx.p1(), wctx, found)
    for dst, src in pycompat.iteritems(found):
        wctx[dst].markcopied(src)
General Comments 0
You need to be logged in to leave comments. Login now