copies: avoid calculating debug-only stuff without --debug...
Martin von Zweigbergk
r44623:782e0d9c default
@@ -1,1126 +1,1127 @@
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import multiprocessing
12 12 import os
13 13
14 14 from .i18n import _
15 15
16 16
17 17 from .revlogutils.flagutil import REVIDX_SIDEDATA
18 18
19 19 from . import (
20 20 error,
21 21 match as matchmod,
22 22 node,
23 23 pathutil,
24 24 pycompat,
25 25 util,
26 26 )
27 27
28 28 from .revlogutils import sidedata as sidedatamod
29 29
30 30 from .utils import stringutil
31 31
32 32
33 33 def _filter(src, dst, t):
34 34 """filters out invalid copies after chaining"""
35 35
36 36 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
37 37 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
38 38 # in the following table (not including trivial cases). For example, case 2
39 39 # is where a file existed in 'src' and remained under that name in 'mid' and
40 40 # then was renamed between 'mid' and 'dst'.
41 41 #
42 42 # case src mid dst result
43 43 # 1 x y - -
44 44 # 2 x y y x->y
45 45 # 3 x y x -
46 46 # 4 x y z x->z
47 47 # 5 - x y -
48 48 # 6 x x y x->y
49 49 #
50 50 # _chain() takes care of chaining the copies in 'a' and 'b', but it
51 51 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
52 52 # between 5 and 6, so it includes all cases in its result.
53 53 # Cases 1, 3, and 5 are then removed by _filter().
54 54
55 55 for k, v in list(t.items()):
56 56 # remove copies from files that didn't exist
57 57 if v not in src:
58 58 del t[k]
59 59 # remove criss-crossed copies
60 60 elif k in src and v in dst:
61 61 del t[k]
62 62 # remove copies to files that were then removed
63 63 elif k not in dst:
64 64 del t[k]
65 65
66 66
67 67 def _chain(prefix, suffix):
68 68 """chain two sets of copies 'prefix' and 'suffix'"""
69 69 result = prefix.copy()
70 70 for key, value in pycompat.iteritems(suffix):
71 71 result[key] = prefix.get(value, value)
72 72 return result
73 73
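To make the case table in `_filter` concrete, here is a minimal, self-contained sketch (plain dicts and sets stand in for real Mercurial contexts; `chain` mirrors `_chain` above, and the commented checks mirror `_filter`):

```python
# Hypothetical stand-ins: sets of file names instead of changectx objects.
def chain(prefix, suffix):
    result = prefix.copy()
    for key, value in suffix.items():
        result[key] = prefix.get(value, value)
    return result

src = {'x'}      # files present in 'src'
dst = {'y'}      # files present in 'dst'
a = {'y': 'x'}   # src -> mid: 'x' renamed to 'y' (case 2 above)
b = {}           # mid -> dst: no further copies
t = chain(a, b)
assert t == {'y': 'x'}
# _filter would keep 'y' -> 'x': the source 'x' exists in src, the
# destination 'y' exists in dst, and the copy is not criss-crossed
# ('y' is not itself present in src).
```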
74 74
75 75 def _tracefile(fctx, am, basemf):
76 76 """return file context that is the ancestor of fctx present in ancestor
77 77 manifest am
78 78
79 79 Note: we used to try and stop after a given limit, but checking whether that
80 80 limit was reached turned out to be very expensive. We are better off
81 81 disabling that feature."""
82 82
83 83 for f in fctx.ancestors():
84 84 path = f.path()
85 85 if am.get(path, None) == f.filenode():
86 86 return path
87 87 if basemf and basemf.get(path, None) == f.filenode():
88 88 return path
89 89
90 90
91 91 def _dirstatecopies(repo, match=None):
92 92 ds = repo.dirstate
93 93 c = ds.copies().copy()
94 94 for k in list(c):
95 95 if ds[k] not in b'anm' or (match and not match(k)):
96 96 del c[k]
97 97 return c
98 98
99 99
100 100 def _computeforwardmissing(a, b, match=None):
101 101 """Computes which files are in b but not a.
102 102 This is its own function so extensions can easily wrap this call to see what
103 103 files _forwardcopies is about to process.
104 104 """
105 105 ma = a.manifest()
106 106 mb = b.manifest()
107 107 return mb.filesnotin(ma, match=match)
108 108
109 109
110 110 def usechangesetcentricalgo(repo):
111 111 """Checks if we should use changeset-centric copy algorithms"""
112 112 if repo.filecopiesmode == b'changeset-sidedata':
113 113 return True
114 114 readfrom = repo.ui.config(b'experimental', b'copies.read-from')
115 115 changesetsource = (b'changeset-only', b'compatibility')
116 116 return readfrom in changesetsource
117 117
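For illustration, a minimal hgrc sketch of the `experimental.copies.read-from` setting read above; either of the two values in `changesetsource` makes this function return True:

```
[experimental]
copies.read-from = compatibility
# or: copies.read-from = changeset-only
```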
118 118
119 119 def _committedforwardcopies(a, b, base, match):
120 120 """Like _forwardcopies(), but b.rev() cannot be None (working copy)"""
121 121 # files might have to be traced back to the fctx parent of the last
122 122 # one-side-only changeset, but not further back than that
123 123 repo = a._repo
124 124
125 125 if usechangesetcentricalgo(repo):
126 126 return _changesetforwardcopies(a, b, match)
127 127
128 128 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
129 129 dbg = repo.ui.debug
130 130 if debug:
131 131 dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
132 132 am = a.manifest()
133 133 basemf = None if base is None else base.manifest()
134 134
135 135 # find where new files came from
136 136 # we currently don't try to find where old files went, too expensive
137 137 # this means we can miss a case like 'hg rm b; hg cp a b'
138 138 cm = {}
139 139
140 140 # Computing the forward missing is quite expensive on large manifests, since
141 141 # it compares the entire manifests. We can optimize it in the common use
142 142 # case of computing what copies are in a commit versus its parent (like
143 143 # during a rebase or histedit). Note, we exclude merge commits from this
144 144 # optimization, since the ctx.files() for a merge commit is not correct for
145 145 # this comparison.
146 146 forwardmissingmatch = match
147 147 if b.p1() == a and b.p2().node() == node.nullid:
148 148 filesmatcher = matchmod.exact(b.files())
149 149 forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
150 150 missing = _computeforwardmissing(a, b, match=forwardmissingmatch)
151 151
152 152 ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
153 153
154 154 if debug:
155 155 dbg(b'debug.copies: missing files to search: %d\n' % len(missing))
156 156
157 157 for f in sorted(missing):
158 158 if debug:
159 159 dbg(b'debug.copies: tracing file: %s\n' % f)
160 160 fctx = b[f]
161 161 fctx._ancestrycontext = ancestrycontext
162 162
163 163 if debug:
164 164 start = util.timer()
165 165 opath = _tracefile(fctx, am, basemf)
166 166 if opath:
167 167 if debug:
168 168 dbg(b'debug.copies: rename of: %s\n' % opath)
169 169 cm[f] = opath
170 170 if debug:
171 171 dbg(
172 172 b'debug.copies: time: %f seconds\n'
173 173 % (util.timer() - start)
174 174 )
175 175 return cm
176 176
177 177
178 178 def _revinfogetter(repo):
179 179 """return a function that return multiple data given a <rev>"i
180 180
181 181 * p1: revision number of first parent
182 182 * p2: revision number of second parent
183 183 * p1copies: mapping of copies from p1
184 184 * p2copies: mapping of copies from p2
185 185 * removed: a list of removed files
186 186 """
187 187 cl = repo.changelog
188 188 parents = cl.parentrevs
189 189
190 190 if repo.filecopiesmode == b'changeset-sidedata':
191 191 changelogrevision = cl.changelogrevision
192 192 flags = cl.flags
193 193
194 194 # A small cache to avoid doing the work twice for merges
195 195 #
196 196 # In the vast majority of cases, if we ask for information about a revision
197 197 # for one parent, we'll later ask for it for the other. So it makes sense to
198 198 # keep the information around when reaching the first parent of a merge
199 199 # and to drop it after it has been provided for the second parent.
200 200 #
201 201 # There are cases where only one parent of the merge will be walked. It
202 202 # happens when the "destination" of the copy tracing is a descendant of a
203 203 # new root, not common with the "source". In that case, we will only walk
204 204 # through merge parents that are descendants of changesets common
205 205 # between "source" and "destination".
206 206 #
207 207 # With the current implementation, if such changesets have copy
208 208 # information, we'll keep them in memory until the end of
209 209 # _changesetforwardcopies. We don't expect the case to be frequent
210 210 # enough to matter.
211 211 #
212 212 # In addition, it would be possible to reach a pathological case where
213 213 # many first parents are met before any second parent is reached. In
214 214 # that case the cache could grow. If this ever becomes an issue, one can
215 215 # safely introduce a maximum cache size. This would trade extra CPU/IO
216 216 # time to save memory.
217 217 merge_caches = {}
218 218
219 219 def revinfo(rev):
220 220 p1, p2 = parents(rev)
221 221 if flags(rev) & REVIDX_SIDEDATA:
222 222 e = merge_caches.pop(rev, None)
223 223 if e is not None:
224 224 return e
225 225 c = changelogrevision(rev)
226 226 p1copies = c.p1copies
227 227 p2copies = c.p2copies
228 228 removed = c.filesremoved
229 229 if p1 != node.nullrev and p2 != node.nullrev:
230 230 # XXX in some cases we over-cache; ignore for now
231 231 merge_caches[rev] = (p1, p2, p1copies, p2copies, removed)
232 232 else:
233 233 p1copies = {}
234 234 p2copies = {}
235 235 removed = []
236 236 return p1, p2, p1copies, p2copies, removed
237 237
238 238 else:
239 239
240 240 def revinfo(rev):
241 241 p1, p2 = parents(rev)
242 242 ctx = repo[rev]
243 243 p1copies, p2copies = ctx._copies
244 244 removed = ctx.filesremoved()
245 245 return p1, p2, p1copies, p2copies, removed
246 246
247 247 return revinfo
248 248
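A hedged usage sketch of the returned callable (`repo` and `some_rev` are assumed to exist; the tuple shape follows the docstring above):

```python
# Hypothetical usage of _revinfogetter().
revinfo = _revinfogetter(repo)
p1, p2, p1copies, p2copies, removed = revinfo(some_rev)
# p1/p2 are parent revision numbers (nullrev when absent),
# p1copies/p2copies map {newname: oldname}, removed lists removed files.
```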
249 249
250 250 def _changesetforwardcopies(a, b, match):
251 251 if a.rev() in (node.nullrev, b.rev()):
252 252 return {}
253 253
254 254 repo = a.repo().unfiltered()
255 255 children = {}
256 256 revinfo = _revinfogetter(repo)
257 257
258 258 cl = repo.changelog
259 259 missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
260 260 mrset = set(missingrevs)
261 261 roots = set()
262 262 for r in missingrevs:
263 263 for p in cl.parentrevs(r):
264 264 if p == node.nullrev:
265 265 continue
266 266 if p not in children:
267 267 children[p] = [r]
268 268 else:
269 269 children[p].append(r)
270 270 if p not in mrset:
271 271 roots.add(p)
272 272 if not roots:
273 273 # no common revision to track copies from
274 274 return {}
275 275 min_root = min(roots)
276 276
277 277 from_head = set(
278 278 cl.reachableroots(min_root, [b.rev()], list(roots), includepath=True)
279 279 )
280 280
281 281 iterrevs = set(from_head)
282 282 iterrevs &= mrset
283 283 iterrevs.update(roots)
284 284 iterrevs.remove(b.rev())
285 285 revs = sorted(iterrevs)
286 286 return _combinechangesetcopies(revs, children, b.rev(), revinfo, match)
287 287
288 288
289 289 def _combinechangesetcopies(revs, children, targetrev, revinfo, match):
290 290 """combine the copies information for each item of iterrevs
291 291
292 292 revs: sorted iterable of revision to visit
293 293 children: a {parent: [children]} mapping.
294 294 targetrev: the final copies destination revision (not in iterrevs)
295 295 revinfo(rev): a function that return (p1, p2, p1copies, p2copies, removed)
296 296 match: a matcher
297 297
298 298 It returns the aggregated copies information for `targetrev`.
299 299 """
300 300 all_copies = {}
301 301 alwaysmatch = match.always()
302 302 for r in revs:
303 303 copies = all_copies.pop(r, None)
304 304 if copies is None:
305 305 # this is a root
306 306 copies = {}
307 307 for i, c in enumerate(children[r]):
308 308 p1, p2, p1copies, p2copies, removed = revinfo(c)
309 309 if r == p1:
310 310 parent = 1
311 311 childcopies = p1copies
312 312 else:
313 313 assert r == p2
314 314 parent = 2
315 315 childcopies = p2copies
316 316 if not alwaysmatch:
317 317 childcopies = {
318 318 dst: src for dst, src in childcopies.items() if match(dst)
319 319 }
320 320 newcopies = copies
321 321 if childcopies:
322 322 newcopies = _chain(newcopies, childcopies)
323 323 # _chain makes a copy; we could avoid doing so in some
324 324 # simple/linear cases.
325 325 assert newcopies is not copies
326 326 for f in removed:
327 327 if f in newcopies:
328 328 if newcopies is copies:
329 329 # copy on write to avoid affecting potential other
330 330 # branches. when there are no other branches, this
331 331 # could be avoided.
332 332 newcopies = copies.copy()
333 333 del newcopies[f]
334 334 othercopies = all_copies.get(c)
335 335 if othercopies is None:
336 336 all_copies[c] = newcopies
337 337 else:
338 338 # we are the second parent to work on c, so we need to merge our
339 339 # work with the other's.
340 340 #
341 341 # Unlike when copies are stored in the filelog, we consider
342 342 # it a copy even if the destination already existed on the
343 343 # other branch. It's simply too expensive to check if the
344 344 # file existed in the manifest.
345 345 #
346 346 # In case of conflict, parent 1 takes precedence over parent 2.
347 347 # This is an arbitrary choice made anew when implementing
348 348 # changeset-based copies. It was made without regard to
349 349 # potential filelog-related behavior.
350 350 if parent == 1:
351 351 othercopies.update(newcopies)
352 352 else:
353 353 newcopies.update(othercopies)
354 354 all_copies[c] = newcopies
355 355 return all_copies[targetrev]
356 356
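To illustrate the propagation, a minimal, self-contained sketch with a stubbed `revinfo` and matcher (all names here are hypothetical, and nullrev is written as -1):

```python
class AlwaysMatch:  # stub matcher: matches every path
    def always(self):
        return True

    def __call__(self, path):
        return True

# Linear DAG 0 -> 1 -> 2, where rev 1 renames 'a' to 'b'.
children = {0: [1], 1: [2]}

def revinfo(rev):
    # returns (p1, p2, p1copies, p2copies, removed)
    if rev == 1:
        return 0, -1, {'b': 'a'}, {}, []
    return 1, -1, {}, {}, []

# _combinechangesetcopies([0, 1], children, 2, revinfo, AlwaysMatch())
# would return {'b': 'a'}: the rename recorded in rev 1 is carried
# forward unchanged to the target rev 2.
```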
357 357
358 358 def _forwardcopies(a, b, base=None, match=None):
359 359 """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
360 360
361 361 if base is None:
362 362 base = a
363 363 match = a.repo().narrowmatch(match)
364 364 # check for working copy
365 365 if b.rev() is None:
366 366 cm = _committedforwardcopies(a, b.p1(), base, match)
367 367 # combine copies from dirstate if necessary
368 368 copies = _chain(cm, _dirstatecopies(b._repo, match))
369 369 else:
370 370 copies = _committedforwardcopies(a, b, base, match)
371 371 return copies
372 372
373 373
374 374 def _backwardrenames(a, b, match):
375 375 if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
376 376 return {}
377 377
378 378 # Even though we're not taking copies into account, 1:n rename situations
379 379 # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
380 380 # arbitrarily pick one of the renames.
381 381 # We don't want to pass in "match" here, since that would filter
382 382 # the destination by it. Since we're reversing the copies, we want
383 383 # to filter the source instead.
384 384 f = _forwardcopies(b, a)
385 385 r = {}
386 386 for k, v in sorted(pycompat.iteritems(f)):
387 387 if match and not match(v):
388 388 continue
389 389 # remove copies
390 390 if v in a:
391 391 continue
392 392 r[v] = k
393 393 return r
394 394
395 395
396 396 def pathcopies(x, y, match=None):
397 397 """find {dst@y: src@x} copy mapping for directed compare"""
398 398 repo = x._repo
399 399 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
400 400 if debug:
401 401 repo.ui.debug(
402 402 b'debug.copies: searching copies from %s to %s\n' % (x, y)
403 403 )
404 404 if x == y or not x or not y:
405 405 return {}
406 406 a = y.ancestor(x)
407 407 if a == x:
408 408 if debug:
409 409 repo.ui.debug(b'debug.copies: search mode: forward\n')
410 410 if y.rev() is None and x == y.p1():
411 411 # short-circuit to avoid issues with merge states
412 412 return _dirstatecopies(repo, match)
413 413 copies = _forwardcopies(x, y, match=match)
414 414 elif a == y:
415 415 if debug:
416 416 repo.ui.debug(b'debug.copies: search mode: backward\n')
417 417 copies = _backwardrenames(x, y, match=match)
418 418 else:
419 419 if debug:
420 420 repo.ui.debug(b'debug.copies: search mode: combined\n')
421 421 base = None
422 422 if a.rev() != node.nullrev:
423 423 base = x
424 424 copies = _chain(
425 425 _backwardrenames(x, a, match=match),
426 426 _forwardcopies(a, y, base, match=match),
427 427 )
428 428 _filter(x, y, copies)
429 429 return copies
430 430
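A hedged usage sketch (assuming an existing `repo`; the revision selectors are purely illustrative):

```python
# {dst@y: src@x} mapping between two revisions; pathcopies picks
# forward, backward or combined tracing based on their ancestry.
x = repo[b'1.0']  # hypothetical tag
y = repo[b'.']    # working directory parent
for dst, src in sorted(pathcopies(x, y).items()):
    print(dst, b'<-', src)
```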
431 431
432 432 def mergecopies(repo, c1, c2, base):
433 433 """
434 434 Finds moves and copies between context c1 and c2 that are relevant for
435 435 merging. 'base' will be used as the merge base.
436 436
437 437 Copytracing is used in commands like rebase, merge, unshelve, etc. to merge
438 438 files that were moved/copied in one merge parent and modified in another.
439 439 For example:
440 440
441 441 o ---> 4 another commit
442 442 |
443 443 | o ---> 3 commit that modifies a.txt
444 444 | /
445 445 o / ---> 2 commit that moves a.txt to b.txt
446 446 |/
447 447 o ---> 1 merge base
448 448
449 449 If we try to rebase revision 3 onto revision 4, since there is no a.txt in
450 450 revision 4, and if the user has copytrace disabled, we print the following
451 451 message:
452 452
453 453 ```other changed <file> which local deleted```
454 454
455 455 Returns five dicts: "copy", "movewithdir", "diverge", "renamedelete" and
456 456 "dirmove".
457 457
458 458 "copy" is a mapping from destination name -> source name,
459 459 where source is in c1 and destination is in c2 or vice-versa.
460 460
461 461 "movewithdir" is a mapping from source name -> destination name,
462 462 where the file at source present in one context but not the other
463 463 needs to be moved to destination by the merge process, because the
464 464 other context moved the directory it is in.
465 465
466 466 "diverge" is a mapping of source name -> list of destination names
467 467 for divergent renames.
468 468
469 469 "renamedelete" is a mapping of source name -> list of destination
470 470 names for files deleted in c1 that were renamed in c2 or vice-versa.
471 471
472 472 "dirmove" is a mapping of detected source dir -> destination dir renames.
473 473 This is needed for handling changes to new files previously grafted into
474 474 renamed directories.
475 475
476 476 This function calls different copytracing algorithms based on config.
477 477 """
478 478 # avoid silly behavior for update from empty dir
479 479 if not c1 or not c2 or c1 == c2:
480 480 return {}, {}, {}, {}, {}
481 481
482 482 narrowmatch = c1.repo().narrowmatch()
483 483
484 484 # avoid silly behavior for parent -> working dir
485 485 if c2.node() is None and c1.node() == repo.dirstate.p1():
486 486 return _dirstatecopies(repo, narrowmatch), {}, {}, {}, {}
487 487
488 488 copytracing = repo.ui.config(b'experimental', b'copytrace')
489 489 if stringutil.parsebool(copytracing) is False:
490 490 # stringutil.parsebool() returns None when it is unable to parse the
491 491 # value, so in such cases we keep copytracing enabled
492 492 return {}, {}, {}, {}, {}
493 493
494 494 if usechangesetcentricalgo(repo):
495 495 # The heuristics don't make sense when we need changeset-centric algos
496 496 return _fullcopytracing(repo, c1, c2, base)
497 497
498 498 # Copy trace disabling is explicitly below the node == p1 logic above
499 499 # because the logic above is required for a simple copy to be kept across a
500 500 # rebase.
501 501 if copytracing == b'heuristics':
502 502 # Do full copytracing if only non-public revisions are involved as
503 503 # that will be fast enough and will also cover the copies which could
504 504 # be missed by heuristics
505 505 if _isfullcopytraceable(repo, c1, base):
506 506 return _fullcopytracing(repo, c1, c2, base)
507 507 return _heuristicscopytracing(repo, c1, c2, base)
508 508 else:
509 509 return _fullcopytracing(repo, c1, c2, base)
510 510
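A sketch of consuming the five returned mappings (names follow the docstring; `repo`, `c1`, `c2` and `base` are assumed to be in hand):

```python
copy, movewithdir, diverge, renamedelete, dirmove = mergecopies(
    repo, c1, c2, base
)
# copy:         {dst: src} copies relevant to the merge
# movewithdir:  {src: dst} files dragged along by a directory rename
# diverge:      {src: [dsts]} divergent renames
# renamedelete: {src: [dsts]} renamed on one side, deleted on the other
# dirmove:      {srcdir/: dstdir/} detected directory renames
```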
511 511
512 512 def _isfullcopytraceable(repo, c1, base):
513 513 """ Checks that if base, source and destination are all no-public branches,
514 514 if yes let's use the full copytrace algorithm for increased capabilities
515 515 since it will be fast enough.
516 516
517 517 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
518 518 number of changesets from c1 to base such that if number of changesets are
519 519 more than the limit, full copytracing algorithm won't be used.
520 520 """
521 521 if c1.rev() is None:
522 522 c1 = c1.p1()
523 523 if c1.mutable() and base.mutable():
524 524 sourcecommitlimit = repo.ui.configint(
525 525 b'experimental', b'copytrace.sourcecommitlimit'
526 526 )
527 527 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
528 528 return commits < sourcecommitlimit
529 529 return False
530 530
531 531
532 532 def _checksinglesidecopies(
533 533 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
534 534 ):
535 535 if src not in m2:
536 536 # deleted on side 2
537 537 if src not in m1:
538 538 # renamed on side 1, deleted on side 2
539 539 renamedelete[src] = dsts1
540 540 elif m2[src] != mb[src]:
541 541 if not _related(c2[src], base[src]):
542 542 return
543 543 # modified on side 2
544 544 for dst in dsts1:
545 545 if dst not in m2:
546 546 # dst not added on side 2 (handle as regular
547 547 # "both created" case in manifestmerge otherwise)
548 548 copy[dst] = src
549 549
550 550
551 551 def _fullcopytracing(repo, c1, c2, base):
552 552 """ The full copytracing algorithm which finds all the new files that were
553 553 added from merge base up to the top commit and for each file it checks if
554 554 this file was copied from another file.
555 555
556 556 This is pretty slow when a lot of changesets are involved but will track all
557 557 the copies.
558 558 """
559 559 m1 = c1.manifest()
560 560 m2 = c2.manifest()
561 561 mb = base.manifest()
562 562
563 563 copies1 = pathcopies(base, c1)
564 564 copies2 = pathcopies(base, c2)
565 565
566 566 if not (copies1 or copies2):
567 567 return {}, {}, {}, {}, {}
568 568
569 569 inversecopies1 = {}
570 570 inversecopies2 = {}
571 571 for dst, src in copies1.items():
572 572 inversecopies1.setdefault(src, []).append(dst)
573 573 for dst, src in copies2.items():
574 574 inversecopies2.setdefault(src, []).append(dst)
575 575
576 576 copy = {}
577 577 diverge = {}
578 578 renamedelete = {}
579 579 allsources = set(inversecopies1) | set(inversecopies2)
580 580 for src in allsources:
581 581 dsts1 = inversecopies1.get(src)
582 582 dsts2 = inversecopies2.get(src)
583 583 if dsts1 and dsts2:
584 584 # copied/renamed on both sides
585 585 if src not in m1 and src not in m2:
586 586 # renamed on both sides
587 587 dsts1 = set(dsts1)
588 588 dsts2 = set(dsts2)
589 589 # If there's some overlap in the rename destinations, we
590 590 # consider it not divergent. For example, if side 1 copies 'a'
591 591 # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
592 592 # and 'd' and deletes 'a'.
593 593 if dsts1 & dsts2:
594 594 for dst in dsts1 & dsts2:
595 595 copy[dst] = src
596 596 else:
597 597 diverge[src] = sorted(dsts1 | dsts2)
598 598 elif src in m1 and src in m2:
599 599 # copied on both sides
600 600 dsts1 = set(dsts1)
601 601 dsts2 = set(dsts2)
602 602 for dst in dsts1 & dsts2:
603 603 copy[dst] = src
604 604 # TODO: Handle cases where it was renamed on one side and copied
605 605 # on the other side
606 606 elif dsts1:
607 607 # copied/renamed only on side 1
608 608 _checksinglesidecopies(
609 609 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
610 610 )
611 611 elif dsts2:
612 612 # copied/renamed only on side 2
613 613 _checksinglesidecopies(
614 614 src, dsts2, m2, m1, mb, c1, base, copy, renamedelete
615 615 )
616 616
617 renamedeleteset = set()
618 divergeset = set()
619 for dsts in diverge.values():
620 divergeset.update(dsts)
621 for dsts in renamedelete.values():
622 renamedeleteset.update(dsts)
623
624 617 # find interesting file sets from manifests
625 618 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
626 619 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
627 620 u1 = sorted(addedinm1 - addedinm2)
628 621 u2 = sorted(addedinm2 - addedinm1)
629 622
630 623 header = b" unmatched files in %s"
631 624 if u1:
632 625 repo.ui.debug(b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1)))
633 626 if u2:
634 627 repo.ui.debug(b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2)))
635 628
636 629 fullcopy = copies1.copy()
637 630 fullcopy.update(copies2)
638 631
639 632 if repo.ui.debugflag:
633 renamedeleteset = set()
634 divergeset = set()
635 for dsts in diverge.values():
636 divergeset.update(dsts)
637 for dsts in renamedelete.values():
638 renamedeleteset.update(dsts)
639
640 640 repo.ui.debug(
641 641 b" all copies found (* = to merge, ! = divergent, "
642 642 b"% = renamed and deleted):\n"
643 643 )
644 644 for f in sorted(fullcopy):
645 645 note = b""
646 646 if f in copy:
647 647 note += b"*"
648 648 if f in divergeset:
649 649 note += b"!"
650 650 if f in renamedeleteset:
651 651 note += b"%"
652 652 repo.ui.debug(
653 653 b" src: '%s' -> dst: '%s' %s\n" % (fullcopy[f], f, note)
654 654 )
655 del renamedeleteset
655 656 del divergeset
656 657
657 658 repo.ui.debug(b" checking for directory renames\n")
658 659
659 660 # generate a directory move map
660 661 d1, d2 = c1.dirs(), c2.dirs()
661 662 invalid = set()
662 663 dirmove = {}
663 664
664 665 # examine each file copy for a potential directory move, which is
665 666 # when all the files in a directory are moved to a new directory
666 667 for dst, src in pycompat.iteritems(fullcopy):
667 668 dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
668 669 if dsrc in invalid:
669 670 # already seen to be uninteresting
670 671 continue
671 672 elif dsrc in d1 and ddst in d1:
672 673 # directory wasn't entirely moved locally
673 674 invalid.add(dsrc)
674 675 elif dsrc in d2 and ddst in d2:
675 676 # directory wasn't entirely moved remotely
676 677 invalid.add(dsrc)
677 678 elif dsrc in dirmove and dirmove[dsrc] != ddst:
678 679 # files from the same directory moved to two different places
679 680 invalid.add(dsrc)
680 681 else:
681 682 # looks good so far
682 683 dirmove[dsrc] = ddst
683 684
684 685 for i in invalid:
685 686 if i in dirmove:
686 687 del dirmove[i]
687 688 del d1, d2, invalid
688 689
689 690 if not dirmove:
690 691 return copy, {}, diverge, renamedelete, {}
691 692
692 693 dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}
693 694
694 695 for d in dirmove:
695 696 repo.ui.debug(
696 697 b" discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
697 698 )
698 699
699 700 movewithdir = {}
700 701 # check unaccounted nonoverlapping files against directory moves
701 702 for f in u1 + u2:
702 703 if f not in fullcopy:
703 704 for d in dirmove:
704 705 if f.startswith(d):
705 706 # new file added in a directory that was moved, move it
706 707 df = dirmove[d] + f[len(d) :]
707 708 if df not in copy:
708 709 movewithdir[f] = df
709 710 repo.ui.debug(
710 711 b" pending file src: '%s' -> dst: '%s'\n"
711 712 % (f, df)
712 713 )
713 714 break
714 715
715 716 return copy, movewithdir, diverge, renamedelete, dirmove
716 717
717 718
718 719 def _heuristicscopytracing(repo, c1, c2, base):
719 720 """ Fast copytracing using filename heuristics
720 721
721 722 Assumes that moves or renames are of the following two types:
722 723
723 724 1) Inside a directory only (same directory name but different filenames)
724 725 2) Move from one directory to another
725 726 (same filenames but different directory names)
726 727
727 728 Works only when there are no merge commits in the "source branch".
728 729 The source branch is the commits from base up to c2, not including base.
729 730
730 731 If a merge is involved, it falls back to _fullcopytracing().
731 732
732 733 Can be used by setting the following config:
733 734
734 735 [experimental]
735 736 copytrace = heuristics
736 737
737 738 In some cases the copy/move candidates found by the heuristics can be very
738 739 numerous, which will make the algorithm slow. The number of possible
739 740 candidates to check can be limited by using the config
740 741 `experimental.copytrace.movecandidateslimit` which defaults to 100.
741 742 """
742 743
743 744 if c1.rev() is None:
744 745 c1 = c1.p1()
745 746 if c2.rev() is None:
746 747 c2 = c2.p1()
747 748
748 749 copies = {}
749 750
750 751 changedfiles = set()
751 752 m1 = c1.manifest()
752 753 if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
753 754 # If base is not in c2 branch, we switch to fullcopytracing
754 755 repo.ui.debug(
755 756 b"switching to full copytracing as base is not "
756 757 b"an ancestor of c2\n"
757 758 )
758 759 return _fullcopytracing(repo, c1, c2, base)
759 760
760 761 ctx = c2
761 762 while ctx != base:
762 763 if len(ctx.parents()) == 2:
763 764 # To keep things simple let's not handle merges
764 765 repo.ui.debug(b"switching to full copytracing because of merges\n")
765 766 return _fullcopytracing(repo, c1, c2, base)
766 767 changedfiles.update(ctx.files())
767 768 ctx = ctx.p1()
768 769
769 770 cp = _forwardcopies(base, c2)
770 771 for dst, src in pycompat.iteritems(cp):
771 772 if src in m1:
772 773 copies[dst] = src
773 774
774 775 # file is missing if it isn't present in the destination, but is present in
775 776 # the base and present in the source.
776 777 # Presence in the base is important to exclude added files, presence in the
777 778 # source is important to exclude removed files.
778 779 filt = lambda f: f not in m1 and f in base and f in c2
779 780 missingfiles = [f for f in changedfiles if filt(f)]
780 781
781 782 if missingfiles:
782 783 basenametofilename = collections.defaultdict(list)
783 784 dirnametofilename = collections.defaultdict(list)
784 785
785 786 for f in m1.filesnotin(base.manifest()):
786 787 basename = os.path.basename(f)
787 788 dirname = os.path.dirname(f)
788 789 basenametofilename[basename].append(f)
789 790 dirnametofilename[dirname].append(f)
790 791
791 792 for f in missingfiles:
792 793 basename = os.path.basename(f)
793 794 dirname = os.path.dirname(f)
794 795 samebasename = basenametofilename[basename]
795 796 samedirname = dirnametofilename[dirname]
796 797 movecandidates = samebasename + samedirname
797 798 # f is guaranteed to be present in c2, that's why
798 799 # c2.filectx(f) won't fail
799 800 f2 = c2.filectx(f)
800 801 # we can have a lot of candidates, which can slow down the heuristics;
801 802 # this config value limits the number of candidate moves to check
802 803 maxcandidates = repo.ui.configint(
803 804 b'experimental', b'copytrace.movecandidateslimit'
804 805 )
805 806
806 807 if len(movecandidates) > maxcandidates:
807 808 repo.ui.status(
808 809 _(
809 810 b"skipping copytracing for '%s', more "
810 811 b"candidates than the limit: %d\n"
811 812 )
812 813 % (f, len(movecandidates))
813 814 )
814 815 continue
815 816
816 817 for candidate in movecandidates:
817 818 f1 = c1.filectx(candidate)
818 819 if _related(f1, f2):
819 820 # if there are a few related copies then we'll merge
820 821 # changes into all of them. This matches the behaviour
821 822 # of upstream copytracing
822 823 copies[candidate] = f
823 824
824 825 return copies, {}, {}, {}, {}
825 826
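The candidate generation in the loop above boils down to grouping the files new in c1 by basename and by dirname; a tiny self-contained sketch (paths chosen for illustration):

```python
import collections
import os

newfiles = [b'src/a.txt', b'docs/readme']
basenametofilename = collections.defaultdict(list)
dirnametofilename = collections.defaultdict(list)
for f in newfiles:
    basenametofilename[os.path.basename(f)].append(f)
    dirnametofilename[os.path.dirname(f)].append(f)

# A missing file b'old/a.txt' picks up b'src/a.txt' as a move
# candidate via the shared basename b'a.txt'.
missing = b'old/a.txt'
candidates = (
    basenametofilename[os.path.basename(missing)]
    + dirnametofilename[os.path.dirname(missing)]
)
assert candidates == [b'src/a.txt']
```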
826 827
827 828 def _related(f1, f2):
828 829 """return True if f1 and f2 filectx have a common ancestor
829 830
830 831 Walk back to common ancestor to see if the two files originate
831 832 from the same file. Since workingfilectx's rev() is None it messes
832 833 up the integer comparison logic, hence the pre-step check for
833 834 None (f1 and f2 can only be workingfilectx's initially).
834 835 """
835 836
836 837 if f1 == f2:
837 838 return True # a match
838 839
839 840 g1, g2 = f1.ancestors(), f2.ancestors()
840 841 try:
841 842 f1r, f2r = f1.linkrev(), f2.linkrev()
842 843
843 844 if f1r is None:
844 845 f1 = next(g1)
845 846 if f2r is None:
846 847 f2 = next(g2)
847 848
848 849 while True:
849 850 f1r, f2r = f1.linkrev(), f2.linkrev()
850 851 if f1r > f2r:
851 852 f1 = next(g1)
852 853 elif f2r > f1r:
853 854 f2 = next(g2)
854 855 else: # f1 and f2 point to files in the same linkrev
855 856 return f1 == f2 # true if they point to the same file
856 857 except StopIteration:
857 858 return False
858 859
859 860
860 861 def graftcopies(wctx, ctx, base):
861 862 """reproduce copies between base and ctx in the wctx
862 863
863 864 Unlike mergecopies(), this function will only consider copies between base
864 865 and ctx; it will ignore copies between base and wctx. Also unlike
865 866 mergecopies(), this function will apply copies to the working copy (instead
866 867 of just returning information about the copies). That makes it cheaper
867 868 (especially in the common case of base==ctx.p1()) and useful also when
868 869 experimental.copytrace=off.
869 870
870 871 merge.update() will have already marked most copies, but it will only
871 872 mark copies if it thinks the source files are related (see
872 873 merge._related()). It will also not mark copies if the file wasn't modified
873 874 on the local side. This function adds the copies that were "missed"
874 875 by merge.update().
875 876 """
876 877 new_copies = pathcopies(base, ctx)
877 878 _filter(wctx.p1(), wctx, new_copies)
878 879 for dst, src in pycompat.iteritems(new_copies):
879 880 wctx[dst].markcopied(src)
880 881
881 882
882 883 def computechangesetfilesadded(ctx):
883 884 """return the list of files added in a changeset
884 885 """
885 886 added = []
886 887 for f in ctx.files():
887 888 if not any(f in p for p in ctx.parents()):
888 889 added.append(f)
889 890 return added
890 891
891 892
892 893 def computechangesetfilesremoved(ctx):
893 894 """return the list of files removed in a changeset
894 895 """
895 896 removed = []
896 897 for f in ctx.files():
897 898 if f not in ctx:
898 899 removed.append(f)
899 900 return removed
900 901
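For intuition, a sketch with a stubbed minimal context (set membership stands in for manifest lookup; everything here is hypothetical):

```python
class StubCtx:
    def __init__(self, manifest, files, parents=()):
        self._manifest = set(manifest)  # files present in the commit
        self._files = files             # files touched by the commit
        self._parents = parents

    def files(self):
        return self._files

    def parents(self):
        return self._parents

    def __contains__(self, f):
        return f in self._manifest

p = StubCtx({b'a', b'c'}, [])
ctx = StubCtx({b'a', b'b'}, [b'b', b'c'], parents=(p,))
assert computechangesetfilesadded(ctx) == [b'b']    # new in ctx
assert computechangesetfilesremoved(ctx) == [b'c']  # touched but gone
```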
901 902
902 903 def computechangesetcopies(ctx):
903 904 """return the copies data for a changeset
904 905
905 906 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
906 907
907 908 Each dictionary is of the form `{newname: oldname}`.
908 909 """
909 910 p1copies = {}
910 911 p2copies = {}
911 912 p1 = ctx.p1()
912 913 p2 = ctx.p2()
913 914 narrowmatch = ctx._repo.narrowmatch()
914 915 for dst in ctx.files():
915 916 if not narrowmatch(dst) or dst not in ctx:
916 917 continue
917 918 copied = ctx[dst].renamed()
918 919 if not copied:
919 920 continue
920 921 src, srcnode = copied
921 922 if src in p1 and p1[src].filenode() == srcnode:
922 923 p1copies[dst] = src
923 924 elif src in p2 and p2[src].filenode() == srcnode:
924 925 p2copies[dst] = src
925 926 return p1copies, p2copies
926 927
927 928
928 929 def encodecopies(files, copies):
929 930 items = []
930 931 for i, dst in enumerate(files):
931 932 if dst in copies:
932 933 items.append(b'%d\0%s' % (i, copies[dst]))
933 934 if len(items) != len(copies):
934 935 raise error.ProgrammingError(
935 936 b'some copy targets missing from file list'
936 937 )
937 938 return b"\n".join(items)
938 939
939 940
940 941 def decodecopies(files, data):
941 942 try:
942 943 copies = {}
943 944 if not data:
944 945 return copies
945 946 for l in data.split(b'\n'):
946 947 strindex, src = l.split(b'\0')
947 948 i = int(strindex)
948 949 dst = files[i]
949 950 copies[dst] = src
950 951 return copies
951 952 except (ValueError, IndexError):
952 953 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
953 954 # used different syntax for the value.
954 955 return None
955 956
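A worked example of the round-trip (values chosen for illustration): each encoded entry is the index of the destination in the sorted file list, a NUL byte, and the source path, with entries joined by newlines:

```python
files = [b'b.txt', b'c.txt']
copies = {b'b.txt': b'a.txt'}

encoded = encodecopies(files, copies)
assert encoded == b'0\x00a.txt'

decoded = decodecopies(files, encoded)
assert decoded == {b'b.txt': b'a.txt'}
```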
956 957
957 958 def encodefileindices(files, subset):
958 959 subset = set(subset)
959 960 indices = []
960 961 for i, f in enumerate(files):
961 962 if f in subset:
962 963 indices.append(b'%d' % i)
963 964 return b'\n'.join(indices)
964 965
965 966
966 967 def decodefileindices(files, data):
967 968 try:
968 969 subset = []
969 970 if not data:
970 971 return subset
971 972 for strindex in data.split(b'\n'):
972 973 i = int(strindex)
973 974 if i < 0 or i >= len(files):
974 975 return None
975 976 subset.append(files[i])
976 977 return subset
977 978 except (ValueError, IndexError):
978 979 # Perhaps someone had chosen the same key name (e.g. "added") and
979 980 # used different syntax for the value.
980 981 return None
981 982
982 983
983 984 def _getsidedata(srcrepo, rev):
984 985 ctx = srcrepo[rev]
985 986 filescopies = computechangesetcopies(ctx)
986 987 filesadded = computechangesetfilesadded(ctx)
987 988 filesremoved = computechangesetfilesremoved(ctx)
988 989 sidedata = {}
989 990 if any([filescopies, filesadded, filesremoved]):
990 991 sortedfiles = sorted(ctx.files())
991 992 p1copies, p2copies = filescopies
992 993 p1copies = encodecopies(sortedfiles, p1copies)
993 994 p2copies = encodecopies(sortedfiles, p2copies)
994 995 filesadded = encodefileindices(sortedfiles, filesadded)
995 996 filesremoved = encodefileindices(sortedfiles, filesremoved)
996 997 if p1copies:
997 998 sidedata[sidedatamod.SD_P1COPIES] = p1copies
998 999 if p2copies:
999 1000 sidedata[sidedatamod.SD_P2COPIES] = p2copies
1000 1001 if filesadded:
1001 1002 sidedata[sidedatamod.SD_FILESADDED] = filesadded
1002 1003 if filesremoved:
1003 1004 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
1004 1005 return sidedata
1005 1006
1006 1007
1007 1008 def getsidedataadder(srcrepo, destrepo):
1008 1009 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
1009 1010 if pycompat.iswindows or not use_w:
1010 1011 return _get_simple_sidedata_adder(srcrepo, destrepo)
1011 1012 else:
1012 1013 return _get_worker_sidedata_adder(srcrepo, destrepo)
1013 1014
1014 1015
1015 1016 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
1016 1017 """The function used by worker precomputing sidedata
1017 1018
1018 1019 It reads an input queue containing revision numbers.
1019 1020 It writes to an output queue containing (rev, <sidedata-map>) pairs.
1020 1021
1021 1022 The `None` input value is used as a stop signal.
1022 1023
1023 1024 The `tokens` semaphore is used to avoid having too many unprocessed
1024 1025 entries. Each worker needs to acquire one token before fetching a task.
1025 1026 They will be released by the consumer of the produced data.
1026 1027 """
1027 1028 tokens.acquire()
1028 1029 rev = revs_queue.get()
1029 1030 while rev is not None:
1030 1031 data = _getsidedata(srcrepo, rev)
1031 1032 sidedata_queue.put((rev, data))
1032 1033 tokens.acquire()
1033 1034 rev = revs_queue.get()
1034 1035 # processing of `None` is completed, release the token.
1035 1036 tokens.release()
1036 1037
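The token handshake above is a bounded producer/consumer pattern: the semaphore caps how many unconsumed results may sit in the output queue, and the consumer releases one token per item it drains. A minimal, self-contained sketch of the same idea (all names hypothetical; squaring stands in for the expensive `_getsidedata` call):

```python
import multiprocessing

def worker(inq, outq, tokens):
    tokens.acquire()
    task = inq.get()
    while task is not None:
        outq.put(task * task)  # stand-in for real sidedata computation
        tokens.acquire()
        task = inq.get()
    tokens.release()  # the token taken for the None sentinel

if __name__ == '__main__':
    tokens = multiprocessing.BoundedSemaphore(4)
    inq, outq = multiprocessing.Queue(), multiprocessing.Queue()
    for t in [1, 2, 3, None]:
        inq.put(t)
    p = multiprocessing.Process(target=worker, args=(inq, outq, tokens))
    p.start()
    for _ in range(3):
        print(outq.get())
        tokens.release()  # free a slot so the worker can keep going
    p.join()
```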
1037 1038
1038 1039 BUFF_PER_WORKER = 50
1039 1040
1040 1041
1041 1042 def _get_worker_sidedata_adder(srcrepo, destrepo):
1042 1043 """The parallel version of the sidedata computation
1043 1044
1044 1045 This code spawns a pool of workers that precompute a buffer of sidedata
1045 1046 before we actually need it."""
1046 1047 # avoid circular import copies -> scmutil -> worker -> copies
1047 1048 from . import worker
1048 1049
1049 1050 nbworkers = worker._numworkers(srcrepo.ui)
1050 1051
1051 1052 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
1052 1053 revsq = multiprocessing.Queue()
1053 1054 sidedataq = multiprocessing.Queue()
1054 1055
1055 1056 assert srcrepo.filtername is None
1056 1057 # queue all tasks beforehand; revision numbers are small and it makes
1057 1058 # synchronisation simpler
1058 1059 #
1059 1060 # Since the computation for each node can be quite expensive, the overhead
1060 1061 # of using a single queue is not relevant. In practice, most computations
1061 1062 # are fast but some are very expensive and dominate all the other smaller
1062 1063 # costs.
1063 1064 for r in srcrepo.changelog.revs():
1064 1065 revsq.put(r)
1065 1066 # queue the "no more tasks" markers
1066 1067 for i in range(nbworkers):
1067 1068 revsq.put(None)
1068 1069
1069 1070 allworkers = []
1070 1071 for i in range(nbworkers):
1071 1072 args = (srcrepo, revsq, sidedataq, tokens)
1072 1073 w = multiprocessing.Process(target=_sidedata_worker, args=args)
1073 1074 allworkers.append(w)
1074 1075 w.start()
1075 1076
1076 1077 # dictionary to store results for revisions higher than the one we are
1077 1078 # looking for. For example, if we need the sidedata map for 42, and 43 is
1078 1079 # received, we shelve 43 for later use.
1079 1080 staging = {}
1080 1081
1081 1082 def sidedata_companion(revlog, rev):
1082 1083 sidedata = {}
1083 1084 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
1084 1085 # Was the data previously shelved?
1085 1086 sidedata = staging.pop(rev, None)
1086 1087 if sidedata is None:
1087 1088 # look at the queued results until we find the one we are looking
1088 1089 # for (shelve the other ones)
1089 1090 r, sidedata = sidedataq.get()
1090 1091 while r != rev:
1091 1092 staging[r] = sidedata
1092 1093 r, sidedata = sidedataq.get()
1093 1094 tokens.release()
1094 1095 return False, (), sidedata
1095 1096
1096 1097 return sidedata_companion
1097 1098
1098 1099
1099 1100 def _get_simple_sidedata_adder(srcrepo, destrepo):
1100 1101 """The simple version of the sidedata computation
1101 1102
1102 1103 It just computes it in the same thread, on request."""
1103 1104
1104 1105 def sidedatacompanion(revlog, rev):
1105 1106 sidedata = {}
1106 1107 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
1107 1108 sidedata = _getsidedata(srcrepo, rev)
1108 1109 return False, (), sidedata
1109 1110
1110 1111 return sidedatacompanion
1111 1112
1112 1113
1113 1114 def getsidedataremover(srcrepo, destrepo):
1114 1115 def sidedatacompanion(revlog, rev):
1115 1116 f = ()
1116 1117 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
1117 1118 if revlog.flags(rev) & REVIDX_SIDEDATA:
1118 1119 f = (
1119 1120 sidedatamod.SD_P1COPIES,
1120 1121 sidedatamod.SD_P2COPIES,
1121 1122 sidedatamod.SD_FILESADDED,
1122 1123 sidedatamod.SD_FILESREMOVED,
1123 1124 )
1124 1125 return False, f, {}
1125 1126
1126 1127 return sidedatacompanion