copies: extract value comparison in the python copy tracing...
marmoute - r46776:3a0c4133 default
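This changeset extracts the per-destination value comparison out of the Python `_merge_copies_dict` loop into a dedicated `_compare_values` helper, which reports its decision through the new `PICK_MINOR`/`PICK_MAJOR`/`PICK_EITHER` constants. A minimal sketch of the resulting shape (simplified: the real code, shown in the diff below, also threads the `changes` and `isancestor` arguments through to the helper, whose `compare` callback here is an illustrative stand-in):

    PICK_MINOR = 0
    PICK_MAJOR = 1
    PICK_EITHER = 2

    def _merge_copies_dict(minor, major, compare):
        # for each destination recorded on the "major" side, either keep the
        # existing "minor" entry or overwrite it, as decided by the extracted
        # comparison helper
        for dest, value in major.items():
            other = minor.get(dest)
            if other is None:
                minor[dest] = value
            elif compare(dest, other, value) == PICK_MAJOR:
                minor[dest] = value
        return minor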
@@ -1,1217 +1,1224 @@
1 1 # coding: utf8
2 2 # copies.py - copy detection for Mercurial
3 3 #
4 4 # Copyright 2008 Matt Mackall <mpm@selenic.com>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import collections
12 12 import os
13 13
14 14 from .i18n import _
15 15 from .node import (
16 16 nullid,
17 17 nullrev,
18 18 )
19 19
20 20 from . import (
21 21 match as matchmod,
22 22 pathutil,
23 23 policy,
24 24 pycompat,
25 25 util,
26 26 )
27 27
28 28
29 29 from .utils import stringutil
30 30
31 31 from .revlogutils import (
32 32 flagutil,
33 33 sidedata as sidedatamod,
34 34 )
35 35
36 36 rustmod = policy.importrust("copy_tracing")
37 37
38 38
39 39 def _filter(src, dst, t):
40 40 """filters out invalid copies after chaining"""
41 41
42 42 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
43 43 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
44 44 # in the following table (not including trivial cases). For example, case 2
45 45 # is where a file existed in 'src' and remained under that name in 'mid' and
46 46 # then was renamed between 'mid' and 'dst'.
47 47 #
48 48 # case src mid dst result
49 49 # 1 x y - -
50 50 # 2 x y y x->y
51 51 # 3 x y x -
52 52 # 4 x y z x->z
53 53 # 5 - x y -
54 54 # 6 x x y x->y
55 55 #
56 56 # _chain() takes care of chaining the copies in 'a' and 'b', but it
57 57 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
58 58 # between 5 and 6, so it includes all cases in its result.
59 59 # Cases 1, 3, and 5 are then removed by _filter().
60 60
61 61 for k, v in list(t.items()):
62 62 # remove copies from files that didn't exist
63 63 if v not in src:
64 64 del t[k]
65 65 # remove criss-crossed copies
66 66 elif k in src and v in dst:
67 67 del t[k]
68 68 # remove copies to files that were then removed
69 69 elif k not in dst:
70 70 del t[k]
71 71
72 72
73 73 def _chain(prefix, suffix):
74 74 """chain two sets of copies 'prefix' and 'suffix'"""
75 75 result = prefix.copy()
76 76 for key, value in pycompat.iteritems(suffix):
77 77 result[key] = prefix.get(value, value)
78 78 return result
79 79
80 80
81 81 def _tracefile(fctx, am, basemf):
82 82 """return file context that is the ancestor of fctx present in ancestor
83 83 manifest am
84 84
85 85 Note: we used to try and stop after a given limit; however, checking if
86 86 that limit was reached turned out to be very expensive. We are better off
87 87 disabling that feature."""
88 88
89 89 for f in fctx.ancestors():
90 90 path = f.path()
91 91 if am.get(path, None) == f.filenode():
92 92 return path
93 93 if basemf and basemf.get(path, None) == f.filenode():
94 94 return path
95 95
96 96
97 97 def _dirstatecopies(repo, match=None):
98 98 ds = repo.dirstate
99 99 c = ds.copies().copy()
100 100 for k in list(c):
101 101 if ds[k] not in b'anm' or (match and not match(k)):
102 102 del c[k]
103 103 return c
104 104
105 105
106 106 def _computeforwardmissing(a, b, match=None):
107 107 """Computes which files are in b but not a.
108 108 This is its own function so extensions can easily wrap this call to see what
109 109 files _forwardcopies is about to process.
110 110 """
111 111 ma = a.manifest()
112 112 mb = b.manifest()
113 113 return mb.filesnotin(ma, match=match)
114 114
115 115
116 116 def usechangesetcentricalgo(repo):
117 117 """Checks if we should use changeset-centric copy algorithms"""
118 118 if repo.filecopiesmode == b'changeset-sidedata':
119 119 return True
120 120 readfrom = repo.ui.config(b'experimental', b'copies.read-from')
121 121 changesetsource = (b'changeset-only', b'compatibility')
122 122 return readfrom in changesetsource
123 123
124 124
125 125 def _committedforwardcopies(a, b, base, match):
126 126 """Like _forwardcopies(), but b.rev() cannot be None (working copy)"""
127 127 # files might have to be traced back to the fctx parent of the last
128 128 # one-side-only changeset, but not further back than that
129 129 repo = a._repo
130 130
131 131 if usechangesetcentricalgo(repo):
132 132 return _changesetforwardcopies(a, b, match)
133 133
134 134 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
135 135 dbg = repo.ui.debug
136 136 if debug:
137 137 dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
138 138 am = a.manifest()
139 139 basemf = None if base is None else base.manifest()
140 140
141 141 # find where new files came from
142 142 # we currently don't try to find where old files went, too expensive
143 143 # this means we can miss a case like 'hg rm b; hg cp a b'
144 144 cm = {}
145 145
146 146 # Computing the forward missing is quite expensive on large manifests, since
147 147 # it compares the entire manifests. We can optimize it in the common use
148 148 # case of computing what copies are in a commit versus its parent (like
149 149 # during a rebase or histedit). Note, we exclude merge commits from this
150 150 # optimization, since the ctx.files() for a merge commit is not correct for
151 151 # this comparison.
152 152 forwardmissingmatch = match
153 153 if b.p1() == a and b.p2().node() == nullid:
154 154 filesmatcher = matchmod.exact(b.files())
155 155 forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
156 156 missing = _computeforwardmissing(a, b, match=forwardmissingmatch)
157 157
158 158 ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
159 159
160 160 if debug:
161 161 dbg(b'debug.copies: missing files to search: %d\n' % len(missing))
162 162
163 163 for f in sorted(missing):
164 164 if debug:
165 165 dbg(b'debug.copies: tracing file: %s\n' % f)
166 166 fctx = b[f]
167 167 fctx._ancestrycontext = ancestrycontext
168 168
169 169 if debug:
170 170 start = util.timer()
171 171 opath = _tracefile(fctx, am, basemf)
172 172 if opath:
173 173 if debug:
174 174 dbg(b'debug.copies: rename of: %s\n' % opath)
175 175 cm[f] = opath
176 176 if debug:
177 177 dbg(
178 178 b'debug.copies: time: %f seconds\n'
179 179 % (util.timer() - start)
180 180 )
181 181 return cm
182 182
183 183
184 184 def _revinfo_getter(repo, match):
185 185 """returns a function that returns the following data given a <rev>"
186 186
187 187 * p1: revision number of first parent
188 188 * p2: revision number of second parent
189 189 * changes: a ChangingFiles object
190 190 """
191 191 cl = repo.changelog
192 192 parents = cl.parentrevs
193 193 flags = cl.flags
194 194
195 195 HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO
196 196
197 197 changelogrevision = cl.changelogrevision
198 198
199 199 if rustmod is not None:
200 200
201 201 def revinfo(rev):
202 202 p1, p2 = parents(rev)
203 203 if flags(rev) & HASCOPIESINFO:
204 204 raw = changelogrevision(rev)._sidedata.get(sidedatamod.SD_FILES)
205 205 else:
206 206 raw = None
207 207 return (p1, p2, raw)
208 208
209 209 else:
210 210
211 211 def revinfo(rev):
212 212 p1, p2 = parents(rev)
213 213 if flags(rev) & HASCOPIESINFO:
214 214 changes = changelogrevision(rev).changes
215 215 else:
216 216 changes = None
217 217 return (p1, p2, changes)
218 218
219 219 return revinfo
220 220
221 221
222 222 def cached_is_ancestor(is_ancestor):
223 223 """return a cached version of is_ancestor"""
224 224 cache = {}
225 225
226 226 def _is_ancestor(anc, desc):
227 227 if anc > desc:
228 228 return False
229 229 elif anc == desc:
230 230 return True
231 231 key = (anc, desc)
232 232 ret = cache.get(key)
233 233 if ret is None:
234 234 ret = cache[key] = is_ancestor(anc, desc)
235 235 return ret
236 236
237 237 return _is_ancestor
238 238
239 239
240 240 def _changesetforwardcopies(a, b, match):
241 241 if a.rev() in (nullrev, b.rev()):
242 242 return {}
243 243
244 244 repo = a.repo().unfiltered()
245 245 children = {}
246 246
247 247 cl = repo.changelog
248 248 isancestor = cl.isancestorrev
249 249
250 250 # To track a rename from "A" to "B", we need to gather all parent → child
251 251 # edges that are contained in `::B` but not in `::A`.
252 252 #
253 253 #
254 254 # To do so, we need to gather all revisions exclusive¹ to "B" (ie¹: `::b -
255 255 # ::a`) and also all the "root points", ie the parents of the exclusive set
256 256 # that belong to ::a. These are exactly all the revisions needed to express
257 257 # the parent → child edges we need to combine.
258 258 #
259 259 # [1] actually, we need to gather all the edges within `(::a)::b`, ie:
260 260 # excluding paths that lead to roots that are not ancestors of `a`. We
261 261 # keep this out of the explanation because it is hard enough without this special case.
262 262
263 263 parents = cl._uncheckedparentrevs
264 264 graph_roots = (nullrev, nullrev)
265 265
266 266 ancestors = cl.ancestors([a.rev()], inclusive=True)
267 267 revs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
268 268 roots = set()
269 269 has_graph_roots = False
270 270
271 271 # iterate over `only(B, A)`
272 272 for r in revs:
273 273 ps = parents(r)
274 274 if ps == graph_roots:
275 275 has_graph_roots = True
276 276 else:
277 277 p1, p2 = ps
278 278
279 279 # find all the "root points" (see larger comment above)
280 280 if p1 != nullrev and p1 in ancestors:
281 281 roots.add(p1)
282 282 if p2 != nullrev and p2 in ancestors:
283 283 roots.add(p2)
284 284 if not roots:
285 285 # no common revision to track copies from
286 286 return {}
287 287 if has_graph_roots:
288 288 # this deals with the special case mentioned in the [1] footnote. We
289 289 # must filter out revisions that lead to non-common graphroots.
290 290 roots = list(roots)
291 291 m = min(roots)
292 292 h = [b.rev()]
293 293 roots_to_head = cl.reachableroots(m, h, roots, includepath=True)
294 294 roots_to_head = set(roots_to_head)
295 295 revs = [r for r in revs if r in roots_to_head]
296 296
297 297 if repo.filecopiesmode == b'changeset-sidedata':
298 298 # When using side-data, we will process the edges "from" the children.
299 299 # We iterate over the children, gathering previously collected data for
300 300 # the parents. To know when the parents' data is no longer necessary, we
301 301 # keep a counter of how many children each revision has.
302 302 #
303 303 # An interesting property of `children_count` is that it only contains
304 304 # revisions that will be relevant for an edge of the graph. So if a
305 305 # child has a parent not in `children_count`, that edge should not be
306 306 # processed.
307 307 children_count = dict((r, 0) for r in roots)
308 308 for r in revs:
309 309 for p in cl.parentrevs(r):
310 310 if p == nullrev:
311 311 continue
312 312 children_count[r] = 0
313 313 if p in children_count:
314 314 children_count[p] += 1
315 315 revinfo = _revinfo_getter(repo, match)
316 316 return _combine_changeset_copies(
317 317 revs, children_count, b.rev(), revinfo, match, isancestor
318 318 )
319 319 else:
320 320 # When not using side-data, we will process the edges "from" the parent,
321 321 # so we need a full mapping of the parent -> children relation.
322 322 children = dict((r, []) for r in roots)
323 323 for r in revs:
324 324 for p in cl.parentrevs(r):
325 325 if p == nullrev:
326 326 continue
327 327 children[r] = []
328 328 if p in children:
329 329 children[p].append(r)
330 330 x = revs.pop()
331 331 assert x == b.rev()
332 332 revs.extend(roots)
333 333 revs.sort()
334 334
335 335 revinfo = _revinfo_getter_extra(repo)
336 336 return _combine_changeset_copies_extra(
337 337 revs, children, b.rev(), revinfo, match, isancestor
338 338 )
339 339
340 340
341 341 def _combine_changeset_copies(
342 342 revs, children_count, targetrev, revinfo, match, isancestor
343 343 ):
344 344 """combine the copies information for each item of iterrevs
345 345
346 346 revs: sorted iterable of revisions to visit
347 347 children_count: a {parent: <number-of-relevant-children>} mapping.
348 348 targetrev: the final copies destination revision (not in iterrevs)
349 349 revinfo(rev): a function that returns (p1, p2, changes)
350 350 match: a matcher
351 351
352 352 It returns the aggregated copies information for `targetrev`.
353 353 """
354 354
355 355 alwaysmatch = match.always()
356 356
357 357 if rustmod is not None:
358 358 final_copies = rustmod.combine_changeset_copies(
359 359 list(revs), children_count, targetrev, revinfo, isancestor
360 360 )
361 361 else:
362 362 isancestor = cached_is_ancestor(isancestor)
363 363
364 364 all_copies = {}
365 365 # iterate over the "children" side of each copy tracing "edge"
366 366 for current_rev in revs:
367 367 p1, p2, changes = revinfo(current_rev)
368 368 current_copies = None
369 369 # iterate over all parents to chain the existing data with the
370 370 # data from the parent → child edge.
371 371 for parent, parent_rev in ((1, p1), (2, p2)):
372 372 if parent_rev == nullrev:
373 373 continue
374 374 remaining_children = children_count.get(parent_rev)
375 375 if remaining_children is None:
376 376 continue
377 377 remaining_children -= 1
378 378 children_count[parent_rev] = remaining_children
379 379 if remaining_children:
380 380 copies = all_copies.get(parent_rev, None)
381 381 else:
382 382 copies = all_copies.pop(parent_rev, None)
383 383
384 384 if copies is None:
385 385 # this is a root
386 386 copies = {}
387 387
388 388 newcopies = copies
389 389 # chain the data in the edge with the existing data
390 390 if changes is not None:
391 391 childcopies = {}
392 392 if parent == 1:
393 393 childcopies = changes.copied_from_p1
394 394 elif parent == 2:
395 395 childcopies = changes.copied_from_p2
396 396
397 397 if childcopies:
398 398 newcopies = copies.copy()
399 399 for dest, source in pycompat.iteritems(childcopies):
400 400 prev = copies.get(source)
401 401 if prev is not None and prev[1] is not None:
402 402 source = prev[1]
403 403 newcopies[dest] = (current_rev, source)
404 404 assert newcopies is not copies
405 405 if changes.removed:
406 406 if newcopies is copies:
407 407 newcopies = copies.copy()
408 408 for f in changes.removed:
409 409 if f in newcopies:
410 410 if newcopies is copies:
411 411 # copy on write to avoid affecting potential other
412 412 # branches. When there are no other branches, this
413 413 # could be avoided.
414 414 newcopies = copies.copy()
415 415 newcopies[f] = (current_rev, None)
416 416 # check potential need to combine the data from another parent (for
417 417 # that child). See comment below for details.
418 418 if current_copies is None:
419 419 current_copies = newcopies
420 420 elif current_copies is newcopies:
421 421 # nothing to merge:
422 422 pass
423 423 else:
424 424 # we are the second parent to work on c, we need to merge our
425 425 # work with the other.
426 426 #
427 427 # In case of conflict, parent 1 takes precedence over parent 2.
428 428 # This is an arbitrary choice made anew when implementing
429 429 # changeset based copies. It was made without regard to
430 430 # potential filelog-related behavior.
431 431 assert parent == 2
432 432 current_copies = _merge_copies_dict(
433 433 newcopies, current_copies, isancestor, changes
434 434 )
435 435 all_copies[current_rev] = current_copies
436 436
437 437 # filter out internal details and return a {dest: source} mapping
438 438 final_copies = {}
439 439 for dest, (tt, source) in all_copies[targetrev].items():
440 440 if source is not None:
441 441 final_copies[dest] = source
442 442 if not alwaysmatch:
443 443 for filename in list(final_copies.keys()):
444 444 if not match(filename):
445 445 del final_copies[filename]
446 446 return final_copies
447 447
448 448
449 # constants to decide which side to pick with _merge_copies_dict
450 PICK_MINOR = 0
451 PICK_MAJOR = 1
452 PICK_EITHER = 2
453
454
449 455 def _merge_copies_dict(minor, major, isancestor, changes):
450 456 """merge two copies-mapping together, minor and major
451 457
452 458 In case of conflict, value from "major" will be picked.
453 459
454 460 - `isancestors(low_rev, high_rev)`: callable returning True if `low_rev` is
455 461 an ancestor of `high_rev`,
456 462
457 463 - `ismerged(path)`: callable returning True if `path` has been merged in the
458 464 current revision,
459 465
460 466 return the resulting dict (in practice, the "minor" object, updated)
461 467 """
462 468 for dest, value in major.items():
463 469 other = minor.get(dest)
464 470 if other is None:
465 471 minor[dest] = value
466 472 else:
467 new_tt = value[0]
468 other_tt = other[0]
469 if value[1] == other[1]:
470 continue
471 # content from "major" wins, unless it is older
472 # than the branch point or there is a merge
473 if new_tt == other_tt:
473 pick = _compare_values(changes, isancestor, dest, other, value)
474 if pick == PICK_MAJOR:
474 475 minor[dest] = value
475 elif (
476 changes is not None
477 and value[1] is None
478 and dest in changes.salvaged
479 ):
480 pass
481 elif (
482 changes is not None
483 and other[1] is None
484 and dest in changes.salvaged
485 ):
486 minor[dest] = value
487 elif changes is not None and dest in changes.merged:
488 minor[dest] = value
489 elif not isancestor(new_tt, other_tt):
490 if value[1] is not None:
491 minor[dest] = value
492 elif isancestor(other_tt, new_tt):
493 minor[dest] = value
494 476 return minor
495 477
496 478
479 def _compare_values(changes, isancestor, dest, other, value):
480 """compare two value within a _merge_copies_dict loop iteration"""
481 new_tt = value[0]
482 other_tt = other[0]
483
484 if value[1] == other[1]:
485 return PICK_EITHER
486 # content from "major" wins, unless it is older
487 # than the branch point or there is a merge
488 if new_tt == other_tt:
489 return PICK_MAJOR
490 elif changes is not None and value[1] is None and dest in changes.salvaged:
491 return PICK_MINOR
492 elif changes is not None and other[1] is None and dest in changes.salvaged:
493 return PICK_MAJOR
494 elif changes is not None and dest in changes.merged:
495 return PICK_MAJOR
496 elif not isancestor(new_tt, other_tt):
497 if value[1] is not None:
498 return PICK_MAJOR
499 elif isancestor(other_tt, new_tt):
500 return PICK_MAJOR
501 return PICK_MINOR
502
503
497 504 def _revinfo_getter_extra(repo):
498 505 """return a function that return multiple data given a <rev>"i
499 506
500 507 * p1: revision number of first parent
501 508 * p2: revision number of second parent
502 509 * p1copies: mapping of copies from p1
503 510 * p2copies: mapping of copies from p2
504 511 * removed: a list of removed files
505 512 * ismerged: a callback to know if file was merged in that revision
506 513 """
507 514 cl = repo.changelog
508 515 parents = cl.parentrevs
509 516
510 517 def get_ismerged(rev):
511 518 ctx = repo[rev]
512 519
513 520 def ismerged(path):
514 521 if path not in ctx.files():
515 522 return False
516 523 fctx = ctx[path]
517 524 parents = fctx._filelog.parents(fctx._filenode)
518 525 nb_parents = 0
519 526 for n in parents:
520 527 if n != nullid:
521 528 nb_parents += 1
522 529 return nb_parents >= 2
523 530
524 531 return ismerged
525 532
526 533 def revinfo(rev):
527 534 p1, p2 = parents(rev)
528 535 ctx = repo[rev]
529 536 p1copies, p2copies = ctx._copies
530 537 removed = ctx.filesremoved()
531 538 return p1, p2, p1copies, p2copies, removed, get_ismerged(rev)
532 539
533 540 return revinfo
534 541
535 542
536 543 def _combine_changeset_copies_extra(
537 544 revs, children, targetrev, revinfo, match, isancestor
538 545 ):
539 546 """version of `_combine_changeset_copies` that works with the Google
540 547 specific "extra" based storage for copy information"""
541 548 all_copies = {}
542 549 alwaysmatch = match.always()
543 550 for r in revs:
544 551 copies = all_copies.pop(r, None)
545 552 if copies is None:
546 553 # this is a root
547 554 copies = {}
548 555 for i, c in enumerate(children[r]):
549 556 p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c)
550 557 if r == p1:
551 558 parent = 1
552 559 childcopies = p1copies
553 560 else:
554 561 assert r == p2
555 562 parent = 2
556 563 childcopies = p2copies
557 564 if not alwaysmatch:
558 565 childcopies = {
559 566 dst: src for dst, src in childcopies.items() if match(dst)
560 567 }
561 568 newcopies = copies
562 569 if childcopies:
563 570 newcopies = copies.copy()
564 571 for dest, source in pycompat.iteritems(childcopies):
565 572 prev = copies.get(source)
566 573 if prev is not None and prev[1] is not None:
567 574 source = prev[1]
568 575 newcopies[dest] = (c, source)
569 576 assert newcopies is not copies
570 577 for f in removed:
571 578 if f in newcopies:
572 579 if newcopies is copies:
573 580 # copy on write to avoid affecting potential other
574 581 # branches. When there are no other branches, this
575 582 # could be avoided.
576 583 newcopies = copies.copy()
577 584 newcopies[f] = (c, None)
578 585 othercopies = all_copies.get(c)
579 586 if othercopies is None:
580 587 all_copies[c] = newcopies
581 588 else:
582 589 # we are the second parent to work on c, we need to merge our
583 590 # work with the other.
584 591 #
585 592 # In case of conflict, parent 1 takes precedence over parent 2.
586 593 # This is an arbitrary choice made anew when implementing
587 594 # changeset based copies. It was made without regard to
588 595 # potential filelog-related behavior.
589 596 if parent == 1:
590 597 _merge_copies_dict_extra(
591 598 othercopies, newcopies, isancestor, ismerged
592 599 )
593 600 else:
594 601 _merge_copies_dict_extra(
595 602 newcopies, othercopies, isancestor, ismerged
596 603 )
597 604 all_copies[c] = newcopies
598 605
599 606 final_copies = {}
600 607 for dest, (tt, source) in all_copies[targetrev].items():
601 608 if source is not None:
602 609 final_copies[dest] = source
603 610 return final_copies
604 611
605 612
606 613 def _merge_copies_dict_extra(minor, major, isancestor, ismerged):
607 614 """version of `_merge_copies_dict` that works with the Google
608 615 specific "extra" based storage for copy information"""
609 616 for dest, value in major.items():
610 617 other = minor.get(dest)
611 618 if other is None:
612 619 minor[dest] = value
613 620 else:
614 621 new_tt = value[0]
615 622 other_tt = other[0]
616 623 if value[1] == other[1]:
617 624 continue
618 625 # content from "major" wins, unless it is older
619 626 # than the branch point or there is a merge
620 627 if (
621 628 new_tt == other_tt
622 629 or not isancestor(new_tt, other_tt)
623 630 or ismerged(dest)
624 631 ):
625 632 minor[dest] = value
626 633
627 634
628 635 def _forwardcopies(a, b, base=None, match=None):
629 636 """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
630 637
631 638 if base is None:
632 639 base = a
633 640 match = a.repo().narrowmatch(match)
634 641 # check for working copy
635 642 if b.rev() is None:
636 643 cm = _committedforwardcopies(a, b.p1(), base, match)
637 644 # combine copies from dirstate if necessary
638 645 copies = _chain(cm, _dirstatecopies(b._repo, match))
639 646 else:
640 647 copies = _committedforwardcopies(a, b, base, match)
641 648 return copies
642 649
643 650
644 651 def _backwardrenames(a, b, match):
645 652 if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
646 653 return {}
647 654
648 655 # Even though we're not taking copies into account, 1:n rename situations
649 656 # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
650 657 # arbitrarily pick one of the renames.
651 658 # We don't want to pass in "match" here, since that would filter
652 659 # the destination by it. Since we're reversing the copies, we want
653 660 # to filter the source instead.
654 661 f = _forwardcopies(b, a)
655 662 r = {}
656 663 for k, v in sorted(pycompat.iteritems(f)):
657 664 if match and not match(v):
658 665 continue
659 666 # remove copies
660 667 if v in a:
661 668 continue
662 669 r[v] = k
663 670 return r
664 671
665 672
666 673 def pathcopies(x, y, match=None):
667 674 """find {dst@y: src@x} copy mapping for directed compare"""
668 675 repo = x._repo
669 676 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
670 677 if debug:
671 678 repo.ui.debug(
672 679 b'debug.copies: searching copies from %s to %s\n' % (x, y)
673 680 )
674 681 if x == y or not x or not y:
675 682 return {}
676 683 if y.rev() is None and x == y.p1():
677 684 if debug:
678 685 repo.ui.debug(b'debug.copies: search mode: dirstate\n')
679 686 # short-circuit to avoid issues with merge states
680 687 return _dirstatecopies(repo, match)
681 688 a = y.ancestor(x)
682 689 if a == x:
683 690 if debug:
684 691 repo.ui.debug(b'debug.copies: search mode: forward\n')
685 692 copies = _forwardcopies(x, y, match=match)
686 693 elif a == y:
687 694 if debug:
688 695 repo.ui.debug(b'debug.copies: search mode: backward\n')
689 696 copies = _backwardrenames(x, y, match=match)
690 697 else:
691 698 if debug:
692 699 repo.ui.debug(b'debug.copies: search mode: combined\n')
693 700 base = None
694 701 if a.rev() != nullrev:
695 702 base = x
696 703 copies = _chain(
697 704 _backwardrenames(x, a, match=match),
698 705 _forwardcopies(a, y, base, match=match),
699 706 )
700 707 _filter(x, y, copies)
701 708 return copies
702 709
703 710
704 711 def mergecopies(repo, c1, c2, base):
705 712 """
706 713 Finds moves and copies between contexts c1 and c2 that are relevant for
707 714 merging. 'base' will be used as the merge base.
708 715
709 716 Copytracing is used in commands like rebase, merge, unshelve, etc to merge
710 717 files that were moved/copied in one merge parent and modified in another.
711 718 For example:
712 719
713 720 o ---> 4 another commit
714 721 |
715 722 | o ---> 3 commit that modifies a.txt
716 723 | /
717 724 o / ---> 2 commit that moves a.txt to b.txt
718 725 |/
719 726 o ---> 1 merge base
720 727
721 728 If we try to rebase revision 3 on revision 4, since there is no a.txt in
722 729 revision 4, and if the user has copytrace disabled, we print the
723 730 following message:
724 731
725 732 ```other changed <file> which local deleted```
726 733
727 734 Returns a tuple where:
728 735
729 736 "branch_copies" an instance of branch_copies.
730 737
731 738 "diverge" is a mapping of source name -> list of destination names
732 739 for divergent renames.
733 740
734 741 This function calls different copytracing algorithms based on config.
735 742 """
736 743 # avoid silly behavior for update from empty dir
737 744 if not c1 or not c2 or c1 == c2:
738 745 return branch_copies(), branch_copies(), {}
739 746
740 747 narrowmatch = c1.repo().narrowmatch()
741 748
742 749 # avoid silly behavior for parent -> working dir
743 750 if c2.node() is None and c1.node() == repo.dirstate.p1():
744 751 return (
745 752 branch_copies(_dirstatecopies(repo, narrowmatch)),
746 753 branch_copies(),
747 754 {},
748 755 )
749 756
750 757 copytracing = repo.ui.config(b'experimental', b'copytrace')
751 758 if stringutil.parsebool(copytracing) is False:
752 759 # stringutil.parsebool() returns None when it is unable to parse the
753 760 # value, so we should rely on making sure copytracing is on in such cases
754 761 return branch_copies(), branch_copies(), {}
755 762
756 763 if usechangesetcentricalgo(repo):
757 764 # The heuristics don't make sense when we need changeset-centric algos
758 765 return _fullcopytracing(repo, c1, c2, base)
759 766
760 767 # Copy trace disabling is explicitly below the node == p1 logic above
761 768 # because the logic above is required for a simple copy to be kept across a
762 769 # rebase.
763 770 if copytracing == b'heuristics':
764 771 # Do full copytracing if only non-public revisions are involved as
765 772 # that will be fast enough and will also cover the copies which could
766 773 # be missed by heuristics
767 774 if _isfullcopytraceable(repo, c1, base):
768 775 return _fullcopytracing(repo, c1, c2, base)
769 776 return _heuristicscopytracing(repo, c1, c2, base)
770 777 else:
771 778 return _fullcopytracing(repo, c1, c2, base)
772 779
773 780
774 781 def _isfullcopytraceable(repo, c1, base):
775 782 """Checks that if base, source and destination are all no-public branches,
776 783 if yes let's use the full copytrace algorithm for increased capabilities
777 784 since it will be fast enough.
778 785
779 786 `experimental.copytrace.sourcecommitlimit` can be used to set a limit on the
780 787 number of changesets from c1 to base; if the number of changesets is more
781 788 than the limit, the full copytracing algorithm won't be used.
782 789 """
783 790 if c1.rev() is None:
784 791 c1 = c1.p1()
785 792 if c1.mutable() and base.mutable():
786 793 sourcecommitlimit = repo.ui.configint(
787 794 b'experimental', b'copytrace.sourcecommitlimit'
788 795 )
789 796 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
790 797 return commits < sourcecommitlimit
791 798 return False
792 799
793 800
794 801 def _checksinglesidecopies(
795 802 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
796 803 ):
797 804 if src not in m2:
798 805 # deleted on side 2
799 806 if src not in m1:
800 807 # renamed on side 1, deleted on side 2
801 808 renamedelete[src] = dsts1
802 809 elif src not in mb:
803 810 # Work around the "short-circuit to avoid issues with merge states"
804 811 # thing in pathcopies(): pathcopies(x, y) can return a copy where the
805 812 # destination doesn't exist in y.
806 813 pass
807 814 elif mb[src] != m2[src] and not _related(c2[src], base[src]):
808 815 return
809 816 elif mb[src] != m2[src] or mb.flags(src) != m2.flags(src):
810 817 # modified on side 2
811 818 for dst in dsts1:
812 819 copy[dst] = src
813 820
814 821
815 822 class branch_copies(object):
816 823 """Information about copies made on one side of a merge/graft.
817 824
818 825 "copy" is a mapping from destination name -> source name,
819 826 where source is in c1 and destination is in c2 or vice-versa.
820 827
821 828 "movewithdir" is a mapping from source name -> destination name,
822 829 where the file at source, present in one context but not the other,
823 830 needs to be moved to destination by the merge process, because the
824 831 other context moved the directory it is in.
825 832
826 833 "renamedelete" is a mapping of source name -> list of destination
827 834 names for files deleted in c1 that were renamed in c2 or vice-versa.
828 835
829 836 "dirmove" is a mapping of detected source dir -> destination dir renames.
830 837 This is needed for handling changes to new files previously grafted into
831 838 renamed directories.
832 839 """
833 840
834 841 def __init__(
835 842 self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
836 843 ):
837 844 self.copy = {} if copy is None else copy
838 845 self.renamedelete = {} if renamedelete is None else renamedelete
839 846 self.dirmove = {} if dirmove is None else dirmove
840 847 self.movewithdir = {} if movewithdir is None else movewithdir
841 848
842 849 def __repr__(self):
843 850 return '<branch_copies\n copy=%r\n renamedelete=%r\n dirmove=%r\n movewithdir=%r\n>' % (
844 851 self.copy,
845 852 self.renamedelete,
846 853 self.dirmove,
847 854 self.movewithdir,
848 855 )
849 856
850 857
851 858 def _fullcopytracing(repo, c1, c2, base):
852 859 """The full copytracing algorithm which finds all the new files that were
853 860 added from the merge base up to the top commit; for each such file it
854 861 checks whether it was copied from another file.
855 862
856 863 This is pretty slow when a lot of changesets are involved but will track all
857 864 the copies.
858 865 """
859 866 m1 = c1.manifest()
860 867 m2 = c2.manifest()
861 868 mb = base.manifest()
862 869
863 870 copies1 = pathcopies(base, c1)
864 871 copies2 = pathcopies(base, c2)
865 872
866 873 if not (copies1 or copies2):
867 874 return branch_copies(), branch_copies(), {}
868 875
869 876 inversecopies1 = {}
870 877 inversecopies2 = {}
871 878 for dst, src in copies1.items():
872 879 inversecopies1.setdefault(src, []).append(dst)
873 880 for dst, src in copies2.items():
874 881 inversecopies2.setdefault(src, []).append(dst)
875 882
876 883 copy1 = {}
877 884 copy2 = {}
878 885 diverge = {}
879 886 renamedelete1 = {}
880 887 renamedelete2 = {}
881 888 allsources = set(inversecopies1) | set(inversecopies2)
882 889 for src in allsources:
883 890 dsts1 = inversecopies1.get(src)
884 891 dsts2 = inversecopies2.get(src)
885 892 if dsts1 and dsts2:
886 893 # copied/renamed on both sides
887 894 if src not in m1 and src not in m2:
888 895 # renamed on both sides
889 896 dsts1 = set(dsts1)
890 897 dsts2 = set(dsts2)
891 898 # If there's some overlap in the rename destinations, we
892 899 # consider it not divergent. For example, if side 1 copies 'a'
893 900 # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
894 901 # and 'd' and deletes 'a'.
895 902 if dsts1 & dsts2:
896 903 for dst in dsts1 & dsts2:
897 904 copy1[dst] = src
898 905 copy2[dst] = src
899 906 else:
900 907 diverge[src] = sorted(dsts1 | dsts2)
901 908 elif src in m1 and src in m2:
902 909 # copied on both sides
903 910 dsts1 = set(dsts1)
904 911 dsts2 = set(dsts2)
905 912 for dst in dsts1 & dsts2:
906 913 copy1[dst] = src
907 914 copy2[dst] = src
908 915 # TODO: Handle cases where it was renamed on one side and copied
909 916 # on the other side
910 917 elif dsts1:
911 918 # copied/renamed only on side 1
912 919 _checksinglesidecopies(
913 920 src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
914 921 )
915 922 elif dsts2:
916 923 # copied/renamed only on side 2
917 924 _checksinglesidecopies(
918 925 src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
919 926 )
920 927
921 928 # find interesting file sets from manifests
922 929 cache = []
923 930
924 931 def _get_addedfiles(idx):
925 932 if not cache:
926 933 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
927 934 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
928 935 u1 = sorted(addedinm1 - addedinm2)
929 936 u2 = sorted(addedinm2 - addedinm1)
930 937 cache.extend((u1, u2))
931 938 return cache[idx]
932 939
933 940 u1fn = lambda: _get_addedfiles(0)
934 941 u2fn = lambda: _get_addedfiles(1)
935 942 if repo.ui.debugflag:
936 943 u1 = u1fn()
937 944 u2 = u2fn()
938 945
939 946 header = b" unmatched files in %s"
940 947 if u1:
941 948 repo.ui.debug(
942 949 b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1))
943 950 )
944 951 if u2:
945 952 repo.ui.debug(
946 953 b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2))
947 954 )
948 955
949 956 renamedeleteset = set()
950 957 divergeset = set()
951 958 for dsts in diverge.values():
952 959 divergeset.update(dsts)
953 960 for dsts in renamedelete1.values():
954 961 renamedeleteset.update(dsts)
955 962 for dsts in renamedelete2.values():
956 963 renamedeleteset.update(dsts)
957 964
958 965 repo.ui.debug(
959 966 b" all copies found (* = to merge, ! = divergent, "
960 967 b"% = renamed and deleted):\n"
961 968 )
962 969 for side, copies in ((b"local", copies1), (b"remote", copies2)):
963 970 if not copies:
964 971 continue
965 972 repo.ui.debug(b" on %s side:\n" % side)
966 973 for f in sorted(copies):
967 974 note = b""
968 975 if f in copy1 or f in copy2:
969 976 note += b"*"
970 977 if f in divergeset:
971 978 note += b"!"
972 979 if f in renamedeleteset:
973 980 note += b"%"
974 981 repo.ui.debug(
975 982 b" src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
976 983 )
977 984 del renamedeleteset
978 985 del divergeset
979 986
980 987 repo.ui.debug(b" checking for directory renames\n")
981 988
982 989 dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2fn)
983 990 dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1fn)
984 991
985 992 branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
986 993 branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)
987 994
988 995 return branch_copies1, branch_copies2, diverge
989 996
990 997
991 998 def _dir_renames(repo, ctx, copy, fullcopy, addedfilesfn):
992 999 """Finds moved directories and files that should move with them.
993 1000
994 1001 ctx: the context for one of the sides
995 1002 copy: files copied on the same side (as ctx)
996 1003 fullcopy: files copied on the same side (as ctx), including those that
997 1004 merge.manifestmerge() won't care about
998 1005 addedfilesfn: function returning added files on the other side (compared to
999 1006 ctx)
1000 1007 """
1001 1008 # generate a directory move map
1002 1009 invalid = set()
1003 1010 dirmove = {}
1004 1011
1005 1012 # examine each file copy for a potential directory move, which is
1006 1013 # when all the files in a directory are moved to a new directory
1007 1014 for dst, src in pycompat.iteritems(fullcopy):
1008 1015 dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
1009 1016 if dsrc in invalid:
1010 1017 # already seen to be uninteresting
1011 1018 continue
1012 1019 elif ctx.hasdir(dsrc) and ctx.hasdir(ddst):
1013 1020 # directory wasn't entirely moved locally
1014 1021 invalid.add(dsrc)
1015 1022 elif dsrc in dirmove and dirmove[dsrc] != ddst:
1016 1023 # files from the same directory moved to two different places
1017 1024 invalid.add(dsrc)
1018 1025 else:
1019 1026 # looks good so far
1020 1027 dirmove[dsrc] = ddst
1021 1028
1022 1029 for i in invalid:
1023 1030 if i in dirmove:
1024 1031 del dirmove[i]
1025 1032 del invalid
1026 1033
1027 1034 if not dirmove:
1028 1035 return {}, {}
1029 1036
1030 1037 dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}
1031 1038
1032 1039 for d in dirmove:
1033 1040 repo.ui.debug(
1034 1041 b" discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
1035 1042 )
1036 1043
1037 1044 movewithdir = {}
1038 1045 # check unaccounted nonoverlapping files against directory moves
1039 1046 for f in addedfilesfn():
1040 1047 if f not in fullcopy:
1041 1048 for d in dirmove:
1042 1049 if f.startswith(d):
1043 1050 # new file added in a directory that was moved, move it
1044 1051 df = dirmove[d] + f[len(d) :]
1045 1052 if df not in copy:
1046 1053 movewithdir[f] = df
1047 1054 repo.ui.debug(
1048 1055 b" pending file src: '%s' -> dst: '%s'\n"
1049 1056 % (f, df)
1050 1057 )
1051 1058 break
1052 1059
1053 1060 return dirmove, movewithdir
1054 1061
1055 1062
1056 1063 def _heuristicscopytracing(repo, c1, c2, base):
1057 1064 """Fast copytracing using filename heuristics
1058 1065
1059 1066 Assumes that moves or renames are of the following two types:
1060 1067
1061 1068 1) Inside a directory only (same directory name but different filenames)
1062 1069 2) Move from one directory to another
1063 1070 (same filenames but different directory names)
1064 1071
1065 1072 Works only when there are no merge commits in the "source branch".
1066 1073 The source branch is the commits from base up to c2, not including base.
1067 1074
1068 1075 If a merge is involved it falls back to _fullcopytracing().
1069 1076
1070 1077 Can be used by setting the following config:
1071 1078
1072 1079 [experimental]
1073 1080 copytrace = heuristics
1074 1081
1075 1082 In some cases the copy/move candidates found by heuristics can be very
1076 1083 numerous, which will make the algorithm slow. The number of possible
1077 1084 candidates to check can be limited by using the config
1078 1085 `experimental.copytrace.movecandidateslimit` which defaults to 100.
1079 1086 """
1080 1087
1081 1088 if c1.rev() is None:
1082 1089 c1 = c1.p1()
1083 1090 if c2.rev() is None:
1084 1091 c2 = c2.p1()
1085 1092
1086 1093 changedfiles = set()
1087 1094 m1 = c1.manifest()
1088 1095 if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
1089 1096 # If base is not in c2 branch, we switch to fullcopytracing
1090 1097 repo.ui.debug(
1091 1098 b"switching to full copytracing as base is not "
1092 1099 b"an ancestor of c2\n"
1093 1100 )
1094 1101 return _fullcopytracing(repo, c1, c2, base)
1095 1102
1096 1103 ctx = c2
1097 1104 while ctx != base:
1098 1105 if len(ctx.parents()) == 2:
1099 1106 # To keep things simple let's not handle merges
1100 1107 repo.ui.debug(b"switching to full copytracing because of merges\n")
1101 1108 return _fullcopytracing(repo, c1, c2, base)
1102 1109 changedfiles.update(ctx.files())
1103 1110 ctx = ctx.p1()
1104 1111
1105 1112 copies2 = {}
1106 1113 cp = _forwardcopies(base, c2)
1107 1114 for dst, src in pycompat.iteritems(cp):
1108 1115 if src in m1:
1109 1116 copies2[dst] = src
1110 1117
1111 1118 # file is missing if it isn't present in the destination, but is present in
1112 1119 # the base and present in the source.
1113 1120 # Presence in the base is important to exclude added files, presence in the
1114 1121 # source is important to exclude removed files.
1115 1122 filt = lambda f: f not in m1 and f in base and f in c2
1116 1123 missingfiles = [f for f in changedfiles if filt(f)]
1117 1124
1118 1125 copies1 = {}
1119 1126 if missingfiles:
1120 1127 basenametofilename = collections.defaultdict(list)
1121 1128 dirnametofilename = collections.defaultdict(list)
1122 1129
1123 1130 for f in m1.filesnotin(base.manifest()):
1124 1131 basename = os.path.basename(f)
1125 1132 dirname = os.path.dirname(f)
1126 1133 basenametofilename[basename].append(f)
1127 1134 dirnametofilename[dirname].append(f)
1128 1135
1129 1136 for f in missingfiles:
1130 1137 basename = os.path.basename(f)
1131 1138 dirname = os.path.dirname(f)
1132 1139 samebasename = basenametofilename[basename]
1133 1140 samedirname = dirnametofilename[dirname]
1134 1141 movecandidates = samebasename + samedirname
1135 1142 # f is guaranteed to be present in c2, that's why
1136 1143 # c2.filectx(f) won't fail
1137 1144 f2 = c2.filectx(f)
1138 1145 # we can have a lot of candidates, which can slow down the heuristics;
1139 1146 # a config value limits the number of candidate moves to check
1140 1147 maxcandidates = repo.ui.configint(
1141 1148 b'experimental', b'copytrace.movecandidateslimit'
1142 1149 )
1143 1150
1144 1151 if len(movecandidates) > maxcandidates:
1145 1152 repo.ui.status(
1146 1153 _(
1147 1154 b"skipping copytracing for '%s', more "
1148 1155 b"candidates than the limit: %d\n"
1149 1156 )
1150 1157 % (f, len(movecandidates))
1151 1158 )
1152 1159 continue
1153 1160
1154 1161 for candidate in movecandidates:
1155 1162 f1 = c1.filectx(candidate)
1156 1163 if _related(f1, f2):
1157 1164 # if there are a few related copies then we'll merge
1158 1165 # changes into all of them. This matches the behaviour
1159 1166 # of upstream copytracing
1160 1167 copies1[candidate] = f
1161 1168
1162 1169 return branch_copies(copies1), branch_copies(copies2), {}
1163 1170
1164 1171
1165 1172 def _related(f1, f2):
1166 1173 """return True if f1 and f2 filectx have a common ancestor
1167 1174
1168 1175 Walk back to common ancestor to see if the two files originate
1169 1176 from the same file. Since workingfilectx's rev() is None it messes
1170 1177 up the integer comparison logic, hence the pre-step check for
1171 1178 None (f1 and f2 can only be workingfilectx's initially).
1172 1179 """
1173 1180
1174 1181 if f1 == f2:
1175 1182 return True # a match
1176 1183
1177 1184 g1, g2 = f1.ancestors(), f2.ancestors()
1178 1185 try:
1179 1186 f1r, f2r = f1.linkrev(), f2.linkrev()
1180 1187
1181 1188 if f1r is None:
1182 1189 f1 = next(g1)
1183 1190 if f2r is None:
1184 1191 f2 = next(g2)
1185 1192
1186 1193 while True:
1187 1194 f1r, f2r = f1.linkrev(), f2.linkrev()
1188 1195 if f1r > f2r:
1189 1196 f1 = next(g1)
1190 1197 elif f2r > f1r:
1191 1198 f2 = next(g2)
1192 1199 else: # f1 and f2 point to files in the same linkrev
1193 1200 return f1 == f2 # true if they point to the same file
1194 1201 except StopIteration:
1195 1202 return False
1196 1203
1197 1204
1198 1205 def graftcopies(wctx, ctx, base):
1199 1206 """reproduce copies between base and ctx in the wctx
1200 1207
1201 1208 Unlike mergecopies(), this function will only consider copies between base
1202 1209 and ctx; it will ignore copies between base and wctx. Also unlike
1203 1210 mergecopies(), this function will apply copies to the working copy (instead
1204 1211 of just returning information about the copies). That makes it cheaper
1205 1212 (especially in the common case of base==ctx.p1()) and useful also when
1206 1213 experimental.copytrace=off.
1207 1214
1208 1215 merge.update() will have already marked most copies, but it will only
1209 1216 mark copies if it thinks the source files are related (see
1210 1217 merge._related()). It will also not mark copies if the file wasn't modified
1211 1218 on the local side. This function adds the copies that were "missed"
1212 1219 by merge.update().
1213 1220 """
1214 1221 new_copies = pathcopies(base, ctx)
1215 1222 _filter(wctx.p1(), wctx, new_copies)
1216 1223 for dst, src in pycompat.iteritems(new_copies):
1217 1224 wctx[dst].markcopied(src)
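For reference, a toy illustration of the `PICK_*` semantics implemented by `_compare_values` above. The `is_anc` lambda is a stand-in assuming a linear history where a smaller revision number is always an ancestor of a larger one, and `changes` is passed as `None` so the salvaged/merged branches are skipped:

    is_anc = lambda anc, desc: anc <= desc  # toy linear-history ancestry

    # identical sources recorded on both sides: either one works
    assert _compare_values(None, is_anc, b'f', (2, b'src'), (5, b'src')) == PICK_EITHER

    # different sources, and the "major" value (rev 5) is not an ancestor of
    # the "minor" value (rev 3): the newer information wins
    assert _compare_values(None, is_anc, b'f', (3, b'old'), (5, b'new')) == PICK_MAJOR

    # the "major" side only records a deletion (source None) on a non-ancestor
    # revision: the "minor" copy information is kept
    assert _compare_values(None, is_anc, b'f', (3, b'old'), (5, None)) == PICK_MINOR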