##// END OF EJS Templates
copies: split up _chain() in naive chaining and filtering steps...
Martin von Zweigbergk -
r42565:f3d06d37 default
parent child Browse files
Show More
@@ -1,799 +1,805 b''
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import heapq
12 12 import os
13 13
14 14 from .i18n import _
15 15
16 16 from . import (
17 17 match as matchmod,
18 18 node,
19 19 pathutil,
20 20 util,
21 21 )
22 22 from .utils import (
23 23 stringutil,
24 24 )
25 25
def _findlimit(repo, ctxa, ctxb):
    """
    Find the last revision that needs to be checked to ensure that a full
    transitive closure for file copies can be properly calculated.
    Generally, this means finding the earliest revision number that's an
    ancestor of a or b but not both, except when a or b is a direct descendent
    of the other, in which case we can return the minimum revnum of a and b.
    """

    # basic idea:
    # - mark a and b with different sides
    # - if a parent's children are all on the same side, the parent is
    #   on that side, otherwise it is on no side
    # - walk the graph in topological order with the help of a heap;
    #   - add unseen parents to side map
    #   - clear side of any parent that has children on different sides
    #   - track number of interesting revs that might still be on a side
    #   - track the lowest interesting rev seen
    #   - quit when interesting revs is zero

    cl = repo.changelog
    wdirparents = None
    a = ctxa.rev()
    b = ctxb.rev()
    # a/b may be the working directory (rev None); substitute the wdir
    # pseudo-rev and remember the real parents for the walk below
    if a is None:
        wdirparents = (ctxa.p1(), ctxa.p2())
        a = node.wdirrev
    if b is None:
        # only one of a/b can be the working directory
        assert not wdirparents
        wdirparents = (ctxb.p1(), ctxb.p2())
        b = node.wdirrev

    side = {a: -1, b: 1}
    # negate revs so the min-heap pops the highest (most recent) rev first
    visit = [-a, -b]
    heapq.heapify(visit)
    interesting = len(visit)
    limit = node.wdirrev

    while interesting:
        r = -heapq.heappop(visit)
        if r == node.wdirrev:
            parents = [pctx.rev() for pctx in wdirparents]
        else:
            parents = cl.parentrevs(r)
            if parents[1] == node.nullrev:
                parents = parents[:1]
        for p in parents:
            if p not in side:
                # first time we see p; add it to visit
                side[p] = side[r]
                if side[p]:
                    interesting += 1
                heapq.heappush(visit, -p)
            elif side[p] and side[p] != side[r]:
                # p was interesting but now we know better
                side[p] = 0
                interesting -= 1
        if side[r]:
            limit = r  # lowest rev visited
            interesting -= 1

    # Consider the following flow (see test-commit-amend.t under issue4405):
    # 1/ File 'a0' committed
    # 2/ File renamed from 'a0' to 'a1' in a new commit (call it 'a1')
    # 3/ Move back to first commit
    # 4/ Create a new commit via revert to contents of 'a1' (call it 'a1-amend')
    # 5/ Rename file from 'a1' to 'a2' and commit --amend 'a1-msg'
    #
    # During the amend in step five, we will be in this state:
    #
    # @  3 temporary amend commit for a1-amend
    # |
    # o  2 a1-amend
    # |
    # | o  1 a1
    # |/
    # o  0 a0
    #
    # When _findlimit is called, a and b are revs 3 and 0, so limit will be 2,
    # yet the filelog has the copy information in rev 1 and we will not look
    # back far enough unless we also look at the a and b as candidates.
    # This only occurs when a is a descendent of b or visa-versa.
    return min(limit, a, b)
109 109
def _chainandfilter(src, dst, a, b):
    """chain two sets of copies 'a' and 'b' and filter result

    'a' maps copies from 'src' to some intermediate commit; 'b' maps copies
    from that intermediate commit to 'dst'. Returns the combined, filtered
    {dst: src} mapping.
    """
    # NOTE(review): this region of the scraped source contained both the old
    # _chain(src, dst, a, b) definition and this replacement; resolved to the
    # post-change version (naive chaining split from filtering).
    #
    # When chaining copies in 'a' (from 'src' via some other commit 'mid') with
    # copies in 'b' (from 'mid' to 'dst'), we can get the different cases in the
    # following table (not including trivial cases). For example, case 2 is
    # where a file existed in 'src' and remained under that name in 'mid' and
    # then was renamed between 'mid' and 'dst'.
    #
    # case  src  mid  dst  result
    #   1    x    y    -    -
    #   2    x    y    y    x->y
    #   3    x    y    x    -
    #   4    x    y    z    x->z
    #   5    -    x    y    -
    #   6    x    x    y    x->y
    #
    # _chain() takes care of chaining the copies in 'a' and 'b', but it
    # cannot tell the difference between cases 1 and 2, between 3 and 4, or
    # between 5 and 6, so it includes all cases in its result.
    # Cases 1, 3, and 5 are then removed by _filter().

    t = _chain(a, b)
    _filter(src, dst, t)
    return t
135
136 def _filter(src, dst, t):
137 """filters out invalid copies after chaining"""
138 for k, v in list(t.items()):
139 # remove copies from files that didn't exist
140 if v not in src:
141 del t[k]
142 # remove criss-crossed copies
143 elif k in src and v in dst:
144 del t[k]
145 # remove copies to files that were then removed
146 elif k not in dst:
147 del t[k]
148
149 def _chain(a, b):
150 """chain two sets of copies 'a' and 'b'"""
130 151 t = a.copy()
131 152 for k, v in b.iteritems():
132 153 if v in t:
133 # Found a chain, i.e. cases 3 & 4. We'll remove case 3 later.
134 154 t[k] = t[v]
135 155 else:
136 # Renamed only in 'b', i.e. cases 5 & 6. We'll remove case 5 later.
137 156 t[k] = v
138
139 for k, v in list(t.items()):
140 # remove copies from files that didn't exist, i.e. case 5
141 if v not in src:
142 del t[k]
143 # remove criss-crossed copies, i.e. case 3
144 elif k in src and v in dst:
145 del t[k]
146 # remove copies to files that were then removed, i.e. case 1
147 # and file 'y' in cases 3 & 4 (in case of rename)
148 elif k not in dst:
149 del t[k]
150
151 157 return t
152 158
153 159 def _tracefile(fctx, am, limit):
154 160 """return file context that is the ancestor of fctx present in ancestor
155 161 manifest am, stopping after the first ancestor lower than limit"""
156 162
157 163 for f in fctx.ancestors():
158 164 if am.get(f.path(), None) == f.filenode():
159 165 return f
160 166 if not f.isintroducedafter(limit):
161 167 return None
162 168
163 169 def _dirstatecopies(repo, match=None):
164 170 ds = repo.dirstate
165 171 c = ds.copies().copy()
166 172 for k in list(c):
167 173 if ds[k] not in 'anm' or (match and not match(k)):
168 174 del c[k]
169 175 return c
170 176
171 177 def _computeforwardmissing(a, b, match=None):
172 178 """Computes which files are in b but not a.
173 179 This is its own function so extensions can easily wrap this call to see what
174 180 files _forwardcopies is about to process.
175 181 """
176 182 ma = a.manifest()
177 183 mb = b.manifest()
178 184 return mb.filesnotin(ma, match=match)
179 185
def usechangesetcentricalgo(repo):
    """Checks if we should use changeset-centric copy algorithms"""
    readfrom = repo.ui.config('experimental', 'copies.read-from')
    return readfrom in ('changeset-only', 'compatibility')
184 190
def _committedforwardcopies(a, b, match):
    """Like _forwardcopies(), but b.rev() cannot be None (working copy)

    Returns a {dst@b: src@a} mapping.
    """
    # files might have to be traced back to the fctx parent of the last
    # one-side-only changeset, but not further back than that
    repo = a._repo

    # changeset-centric algorithms read copy metadata from changesets
    # instead of walking filelogs
    if usechangesetcentricalgo(repo):
        return _changesetforwardcopies(a, b, match)

    debug = repo.ui.debugflag and repo.ui.configbool('devel', 'debug.copies')
    dbg = repo.ui.debug
    if debug:
        dbg('debug.copies: looking into rename from %s to %s\n'
            % (a, b))
    limit = _findlimit(repo, a, b)
    if debug:
        dbg('debug.copies: search limit: %d\n' % limit)
    am = a.manifest()

    # find where new files came from
    # we currently don't try to find where old files went, too expensive
    # this means we can miss a case like 'hg rm b; hg cp a b'
    cm = {}

    # Computing the forward missing is quite expensive on large manifests, since
    # it compares the entire manifests. We can optimize it in the common use
    # case of computing what copies are in a commit versus its parent (like
    # during a rebase or histedit). Note, we exclude merge commits from this
    # optimization, since the ctx.files() for a merge commit is not correct for
    # this comparison.
    forwardmissingmatch = match
    if b.p1() == a and b.p2().node() == node.nullid:
        filesmatcher = matchmod.exact(b.files())
        forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
    missing = _computeforwardmissing(a, b, match=forwardmissingmatch)

    # restrict ancestor walks to revisions between a and b
    ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)

    if debug:
        dbg('debug.copies: missing files to search: %d\n' % len(missing))

    for f in sorted(missing):
        if debug:
            dbg('debug.copies: tracing file: %s\n' % f)
        fctx = b[f]
        fctx._ancestrycontext = ancestrycontext

        if debug:
            start = util.timer()
        ofctx = _tracefile(fctx, am, limit)
        if ofctx:
            if debug:
                dbg('debug.copies: rename of: %s\n' % ofctx._path)
            cm[f] = ofctx.path()
        if debug:
            dbg('debug.copies: time: %f seconds\n'
                % (util.timer() - start))
    return cm
243 249
def _changesetforwardcopies(a, b, match):
    """Find {dst@b: src@a} copies using changeset-stored copy metadata.

    Walks from a's missing descendants toward b, chaining each child's
    p1copies()/p2copies() onto the copies accumulated so far.
    """
    # NOTE(review): the scraped source contained both the old '_chain(...)'
    # call and its replacement '_chainandfilter(...)'; resolved to the
    # post-change version.
    if a.rev() == node.nullrev:
        return {}

    repo = a.repo()
    children = {}
    cl = repo.changelog
    missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
    for r in missingrevs:
        for p in cl.parentrevs(r):
            if p == node.nullrev:
                continue
            if p not in children:
                children[p] = [r]
            else:
                children[p].append(r)

    roots = set(children) - set(missingrevs)
    # 'work' contains 3-tuples of a (revision number, parent number, copies).
    # The parent number is only used for knowing which parent the copies dict
    # came from.
    work = [(r, 1, {}) for r in roots]
    heapq.heapify(work)
    while work:
        r, i1, copies1 = heapq.heappop(work)
        if work and work[0][0] == r:
            # We are tracing copies from both parents
            r, i2, copies2 = heapq.heappop(work)
            copies = {}
            ctx = repo[r]
            p1man, p2man = ctx.p1().manifest(), ctx.p2().manifest()
            allcopies = set(copies1) | set(copies2)
            # TODO: perhaps this filtering should be done as long as ctx
            # is merge, whether or not we're tracing from both parent.
            for dst in allcopies:
                if not match(dst):
                    continue
                if dst not in copies2:
                    # Copied on p1 side: mark as copy from p1 side if it didn't
                    # already exist on p2 side
                    if dst not in p2man:
                        copies[dst] = copies1[dst]
                elif dst not in copies1:
                    # Copied on p2 side: mark as copy from p2 side if it didn't
                    # already exist on p1 side
                    if dst not in p1man:
                        copies[dst] = copies2[dst]
                else:
                    # Copied on both sides: mark as copy from p1 side
                    copies[dst] = copies1[dst]
        else:
            copies = copies1
        if r == b.rev():
            return copies
        for c in children[r]:
            childctx = repo[c]
            if r == childctx.p1().rev():
                parent = 1
                childcopies = childctx.p1copies()
            else:
                assert r == childctx.p2().rev()
                parent = 2
                childcopies = childctx.p2copies()
            if not match.always():
                childcopies = {dst: src for dst, src in childcopies.items()
                               if match(dst)}
            childcopies = _chainandfilter(a, childctx, copies, childcopies)
            heapq.heappush(work, (c, parent, childcopies))
    assert False
313 319
def _forwardcopies(a, b, match=None):
    """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
    # NOTE(review): the scraped source contained both the old '_chain(...)'
    # return and its replacement '_chainandfilter(...)'; resolved to the
    # post-change version.

    match = a.repo().narrowmatch(match)
    # check for working copy
    if b.rev() is None:
        if a == b.p1():
            # short-circuit to avoid issues with merge states
            return _dirstatecopies(b._repo, match)

        cm = _committedforwardcopies(a, b.p1(), match)
        # combine copies from dirstate if necessary
        return _chainandfilter(a, b, cm, _dirstatecopies(b._repo, match))
    return _committedforwardcopies(a, b, match)
328 334
def _backwardrenames(a, b, match):
    """Find renames from b back to a, dropping plain copies.

    Computes forward copies from b to a and inverts them, keeping only
    entries whose source no longer exists in a (i.e. renames).
    """
    if a._repo.ui.config('experimental', 'copytrace') == 'off':
        return {}

    # Even though we're not taking copies into account, 1:n rename situations
    # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
    # arbitrarily pick one of the renames.
    # We don't want to pass in "match" here, since that would filter
    # the destination by it. Since we're reversing the copies, we want
    # to filter the source instead.
    forward = _forwardcopies(b, a)
    renames = {}
    for dest, source in sorted(forward.iteritems()):
        if match and not match(source):
            continue
        # still present in 'a': a copy, not a rename
        if source in a:
            continue
        renames[source] = dest
    return renames
349 355
def pathcopies(x, y, match=None):
    """find {dst@y: src@x} copy mapping for directed compare"""
    # NOTE(review): the scraped source contained both the old '_chain(...)'
    # return and its replacement '_chainandfilter(...)'; resolved to the
    # post-change version.
    repo = x._repo
    debug = repo.ui.debugflag and repo.ui.configbool('devel', 'debug.copies')
    if debug:
        repo.ui.debug('debug.copies: searching copies from %s to %s\n'
                      % (x, y))
    if x == y or not x or not y:
        return {}
    a = y.ancestor(x)
    if a == x:
        if debug:
            repo.ui.debug('debug.copies: search mode: forward\n')
        return _forwardcopies(x, y, match=match)
    if a == y:
        if debug:
            repo.ui.debug('debug.copies: search mode: backward\n')
        return _backwardrenames(x, y, match=match)
    if debug:
        repo.ui.debug('debug.copies: search mode: combined\n')
    # neither endpoint is an ancestor of the other: go back to the common
    # ancestor, then forward to y
    return _chainandfilter(x, y, _backwardrenames(x, a, match=match),
                           _forwardcopies(a, y, match=match))
372 378
def mergecopies(repo, c1, c2, base):
    """
    Finds moves and copies between context c1 and c2 that are relevant for
    merging. 'base' will be used as the merge base.

    Copytracing is used in commands like rebase, merge, unshelve, etc to merge
    files that were moved/ copied in one merge parent and modified in another.
    For example:

    o          ---> 4 another commit
    |
    | o        ---> 3 commit that modifies a.txt
    |/
    o          ---> 2 commit that moves a.txt to b.txt
    |
    o          ---> 1 merge base

    If we try to rebase revision 3 on revision 4, since there is no a.txt in
    revision 4, and if the user has copytrace disabled, we print the following
    message:

    ```other changed <file> which local deleted```

    Returns five dicts: "copy", "movewithdir", "diverge", "renamedelete" and
    "dirmove".

    "copy" is a mapping from destination name -> source name,
    where source is in c1 and destination is in c2 or vice-versa.

    "movewithdir" is a mapping from source name -> destination name,
    where the file at source present in one context but not the other
    needs to be moved to destination by the merge process, because the
    other context moved the directory it is in.

    "diverge" is a mapping of source name -> list of destination names
    for divergent renames.

    "renamedelete" is a mapping of source name -> list of destination
    names for files deleted in c1 that were renamed in c2 or vice-versa.

    "dirmove" is a mapping of detected source dir -> destination dir renames.
    This is needed for handling changes to new files previously grafted into
    renamed directories.

    This function calls different copytracing algorithms based on config.
    """
    # avoid silly behavior for update from empty dir
    if not c1 or not c2 or c1 == c2:
        return {}, {}, {}, {}, {}

    narrowmatch = c1.repo().narrowmatch()

    # avoid silly behavior for parent -> working dir
    if c2.node() is None and c1.node() == repo.dirstate.p1():
        return _dirstatecopies(repo, narrowmatch), {}, {}, {}, {}

    copytracing = repo.ui.config('experimental', 'copytrace')
    if stringutil.parsebool(copytracing) is False:
        # stringutil.parsebool() returns None when it is unable to parse the
        # value, so we should rely on making sure copytracing is on such cases
        return {}, {}, {}, {}, {}

    if usechangesetcentricalgo(repo):
        # The heuristics don't make sense when we need changeset-centric algos
        return _fullcopytracing(repo, c1, c2, base)

    # Copy trace disabling is explicitly below the node == p1 logic above
    # because the logic above is required for a simple copy to be kept across a
    # rebase.
    if copytracing == 'heuristics':
        # Do full copytracing if only non-public revisions are involved as
        # that will be fast enough and will also cover the copies which could
        # be missed by heuristics
        if _isfullcopytraceable(repo, c1, base):
            return _fullcopytracing(repo, c1, c2, base)
        return _heuristicscopytracing(repo, c1, c2, base)
    else:
        return _fullcopytracing(repo, c1, c2, base)
451 457
452 458 def _isfullcopytraceable(repo, c1, base):
453 459 """ Checks that if base, source and destination are all no-public branches,
454 460 if yes let's use the full copytrace algorithm for increased capabilities
455 461 since it will be fast enough.
456 462
457 463 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
458 464 number of changesets from c1 to base such that if number of changesets are
459 465 more than the limit, full copytracing algorithm won't be used.
460 466 """
461 467 if c1.rev() is None:
462 468 c1 = c1.p1()
463 469 if c1.mutable() and base.mutable():
464 470 sourcecommitlimit = repo.ui.configint('experimental',
465 471 'copytrace.sourcecommitlimit')
466 472 commits = len(repo.revs('%d::%d', base.rev(), c1.rev()))
467 473 return commits < sourcecommitlimit
468 474 return False
469 475
470 476 def _checksinglesidecopies(src, dsts1, m1, m2, mb, c2, base,
471 477 copy, renamedelete):
472 478 if src not in m2:
473 479 # deleted on side 2
474 480 if src not in m1:
475 481 # renamed on side 1, deleted on side 2
476 482 renamedelete[src] = dsts1
477 483 elif m2[src] != mb[src]:
478 484 if not _related(c2[src], base[src]):
479 485 return
480 486 # modified on side 2
481 487 for dst in dsts1:
482 488 if dst not in m2:
483 489 # dst not added on side 2 (handle as regular
484 490 # "both created" case in manifestmerge otherwise)
485 491 copy[dst] = src
486 492
def _fullcopytracing(repo, c1, c2, base):
    """ The full copytracing algorithm which finds all the new files that were
    added from merge base up to the top commit and for each file it checks if
    this file was copied from another file.

    This is pretty slow when a lot of changesets are involved but will track all
    the copies.

    Returns the same five dicts as mergecopies(): copy, movewithdir,
    diverge, renamedelete, dirmove.
    """
    m1 = c1.manifest()
    m2 = c2.manifest()
    mb = base.manifest()

    copies1 = pathcopies(base, c1)
    copies2 = pathcopies(base, c2)

    # invert the {dst: src} maps into {src: [dst, ...]} so we can reason
    # per source file
    inversecopies1 = {}
    inversecopies2 = {}
    for dst, src in copies1.items():
        inversecopies1.setdefault(src, []).append(dst)
    for dst, src in copies2.items():
        inversecopies2.setdefault(src, []).append(dst)

    copy = {}
    diverge = {}
    renamedelete = {}
    allsources = set(inversecopies1) | set(inversecopies2)
    for src in allsources:
        dsts1 = inversecopies1.get(src)
        dsts2 = inversecopies2.get(src)
        if dsts1 and dsts2:
            # copied/renamed on both sides
            if src not in m1 and src not in m2:
                # renamed on both sides
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                # If there's some overlap in the rename destinations, we
                # consider it not divergent. For example, if side 1 copies 'a'
                # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
                # and 'd' and deletes 'a'.
                if dsts1 & dsts2:
                    for dst in (dsts1 & dsts2):
                        copy[dst] = src
                else:
                    diverge[src] = sorted(dsts1 | dsts2)
            elif src in m1 and src in m2:
                # copied on both sides
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                for dst in (dsts1 & dsts2):
                    copy[dst] = src
            # TODO: Handle cases where it was renamed on one side and copied
            # on the other side
        elif dsts1:
            # copied/renamed only on side 1
            _checksinglesidecopies(src, dsts1, m1, m2, mb, c2, base,
                                   copy, renamedelete)
        elif dsts2:
            # copied/renamed only on side 2
            _checksinglesidecopies(src, dsts2, m2, m1, mb, c1, base,
                                   copy, renamedelete)

    renamedeleteset = set()
    divergeset = set()
    for dsts in diverge.values():
        divergeset.update(dsts)
    for dsts in renamedelete.values():
        renamedeleteset.update(dsts)

    # find interesting file sets from manifests
    addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
    addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
    u1 = sorted(addedinm1 - addedinm2)
    u2 = sorted(addedinm2 - addedinm1)

    header = " unmatched files in %s"
    if u1:
        repo.ui.debug("%s:\n %s\n" % (header % 'local', "\n ".join(u1)))
    if u2:
        repo.ui.debug("%s:\n %s\n" % (header % 'other', "\n ".join(u2)))

    fullcopy = copies1.copy()
    fullcopy.update(copies2)
    if not fullcopy:
        return copy, {}, diverge, renamedelete, {}

    if repo.ui.debugflag:
        repo.ui.debug(" all copies found (* = to merge, ! = divergent, "
                      "% = renamed and deleted):\n")
        for f in sorted(fullcopy):
            note = ""
            if f in copy:
                note += "*"
            if f in divergeset:
                note += "!"
            if f in renamedeleteset:
                note += "%"
            repo.ui.debug(" src: '%s' -> dst: '%s' %s\n" % (fullcopy[f], f,
                                                            note))
    del divergeset

    repo.ui.debug(" checking for directory renames\n")

    # generate a directory move map
    d1, d2 = c1.dirs(), c2.dirs()
    invalid = set()
    dirmove = {}

    # examine each file copy for a potential directory move, which is
    # when all the files in a directory are moved to a new directory
    for dst, src in fullcopy.iteritems():
        dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
        if dsrc in invalid:
            # already seen to be uninteresting
            continue
        elif dsrc in d1 and ddst in d1:
            # directory wasn't entirely moved locally
            invalid.add(dsrc)
        elif dsrc in d2 and ddst in d2:
            # directory wasn't entirely moved remotely
            invalid.add(dsrc)
        elif dsrc in dirmove and dirmove[dsrc] != ddst:
            # files from the same directory moved to two different places
            invalid.add(dsrc)
        else:
            # looks good so far
            dirmove[dsrc] = ddst

    for i in invalid:
        if i in dirmove:
            del dirmove[i]
    del d1, d2, invalid

    if not dirmove:
        return copy, {}, diverge, renamedelete, {}

    dirmove = {k + "/": v + "/" for k, v in dirmove.iteritems()}

    for d in dirmove:
        repo.ui.debug(" discovered dir src: '%s' -> dst: '%s'\n" %
                      (d, dirmove[d]))

    movewithdir = {}
    # check unaccounted nonoverlapping files against directory moves
    for f in u1 + u2:
        if f not in fullcopy:
            for d in dirmove:
                if f.startswith(d):
                    # new file added in a directory that was moved, move it
                    df = dirmove[d] + f[len(d):]
                    if df not in copy:
                        movewithdir[f] = df
                        repo.ui.debug((" pending file src: '%s' -> "
                                       "dst: '%s'\n") % (f, df))
                        break

    return copy, movewithdir, diverge, renamedelete, dirmove
643 649
def _heuristicscopytracing(repo, c1, c2, base):
    """ Fast copytracing using filename heuristics

    Assumes that moves or renames are of following two types:

    1) Inside a directory only (same directory name but different filenames)
    2) Move from one directory to another
       (same filenames but different directory names)

    Works only when there are no merge commits in the "source branch".
    Source branch is commits from base up to c2 not including base.

    If merge is involved it fallbacks to _fullcopytracing().

    Can be used by setting the following config:

        [experimental]
        copytrace = heuristics

    In some cases the copy/move candidates found by heuristics can be very large
    in number and that will make the algorithm slow. The number of possible
    candidates to check can be limited by using the config
    `experimental.copytrace.movecandidateslimit` which defaults to 100.
    """

    # work on committed revisions; fall back to the first parent for the
    # working directory
    if c1.rev() is None:
        c1 = c1.p1()
    if c2.rev() is None:
        c2 = c2.p1()

    copies = {}

    changedfiles = set()
    m1 = c1.manifest()
    if not repo.revs('%d::%d', base.rev(), c2.rev()):
        # If base is not in c2 branch, we switch to fullcopytracing
        repo.ui.debug("switching to full copytracing as base is not "
                      "an ancestor of c2\n")
        return _fullcopytracing(repo, c1, c2, base)

    ctx = c2
    while ctx != base:
        if len(ctx.parents()) == 2:
            # To keep things simple let's not handle merges
            repo.ui.debug("switching to full copytracing because of merges\n")
            return _fullcopytracing(repo, c1, c2, base)
        changedfiles.update(ctx.files())
        ctx = ctx.p1()

    # keep only copies whose source still exists in c1
    cp = _forwardcopies(base, c2)
    for dst, src in cp.iteritems():
        if src in m1:
            copies[dst] = src

    # file is missing if it isn't present in the destination, but is present in
    # the base and present in the source.
    # Presence in the base is important to exclude added files, presence in the
    # source is important to exclude removed files.
    filt = lambda f: f not in m1 and f in base and f in c2
    missingfiles = [f for f in changedfiles if filt(f)]

    if missingfiles:
        # index c1's new files by basename and by directory so candidates
        # can be looked up cheaply per missing file
        basenametofilename = collections.defaultdict(list)
        dirnametofilename = collections.defaultdict(list)

        for f in m1.filesnotin(base.manifest()):
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            basenametofilename[basename].append(f)
            dirnametofilename[dirname].append(f)

        for f in missingfiles:
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            samebasename = basenametofilename[basename]
            samedirname = dirnametofilename[dirname]
            movecandidates = samebasename + samedirname
            # f is guaranteed to be present in c2, that's why
            # c2.filectx(f) won't fail
            f2 = c2.filectx(f)
            # we can have a lot of candidates which can slow down the heuristics
            # config value to limit the number of candidates moves to check
            maxcandidates = repo.ui.configint('experimental',
                                              'copytrace.movecandidateslimit')

            if len(movecandidates) > maxcandidates:
                repo.ui.status(_("skipping copytracing for '%s', more "
                                 "candidates than the limit: %d\n")
                               % (f, len(movecandidates)))
                continue

            for candidate in movecandidates:
                f1 = c1.filectx(candidate)
                if _related(f1, f2):
                    # if there are a few related copies then we'll merge
                    # changes into all of them. This matches the behaviour
                    # of upstream copytracing
                    copies[candidate] = f

    return copies, {}, {}, {}, {}
744 750
def _related(f1, f2):
    """return True if f1 and f2 filectx have a common ancestor

    Walk back to common ancestor to see if the two files originate
    from the same file. Since workingfilectx's rev() is None it messes
    up the integer comparison logic, hence the pre-step check for
    None (f1 and f2 can only be workingfilectx's initially).
    """

    if f1 == f2:
        return True # a match

    g1, g2 = f1.ancestors(), f2.ancestors()
    try:
        f1r, f2r = f1.linkrev(), f2.linkrev()

        # step past working-copy contexts (linkrev None) before comparing
        if f1r is None:
            f1 = next(g1)
        if f2r is None:
            f2 = next(g2)

        # walk whichever side has the higher linkrev back until the two
        # linkrevs meet; exhausting either ancestor generator means the
        # files are unrelated
        while True:
            f1r, f2r = f1.linkrev(), f2.linkrev()
            if f1r > f2r:
                f1 = next(g1)
            elif f2r > f1r:
                f2 = next(g2)
            else: # f1 and f2 point to files in the same linkrev
                return f1 == f2 # true if they point to the same file
    except StopIteration:
        return False
776 782
def duplicatecopies(repo, wctx, rev, fromrev, skiprev=None):
    """reproduce copies from fromrev to rev in the dirstate

    If skiprev is specified, it's a revision that should be used to
    filter copy records. Any copies that occur between fromrev and
    skiprev will not be duplicated, even if they appear in the set of
    copies between fromrev and rev.
    """
    exclude = {}
    ctraceconfig = repo.ui.config('experimental', 'copytrace')
    bctrace = stringutil.parsebool(ctraceconfig)
    tracing = (ctraceconfig == 'heuristics' or bctrace or bctrace is None)
    if skiprev is not None and tracing:
        # copytrace='off' skips this line, but not the entire function because
        # the line below is O(size of the repo) during a rebase, while the rest
        # of the function is much faster (and is required for carrying copy
        # metadata across the rebase anyway).
        exclude = pathcopies(repo[fromrev], repo[skiprev])
    for dst, src in pathcopies(repo[fromrev], repo[rev]).iteritems():
        if dst not in exclude and dst in wctx:
            wctx[dst].markcopied(src)
General Comments 0
You need to be logged in to leave comments. Login now