graftcopies: document why the function is useful at all...
Martin von Zweigbergk
r44552:06e7e765 default
@@ -1,1111 +1,1125 @@
1 1 # copies.py - copy detection for Mercurial
2 2 #
3 3 # Copyright 2008 Matt Mackall <mpm@selenic.com>
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import collections
11 11 import multiprocessing
12 12 import os
13 13
14 14 from .i18n import _
15 15
16 16
17 17 from .revlogutils.flagutil import REVIDX_SIDEDATA
18 18
19 19 from . import (
20 20 error,
21 21 match as matchmod,
22 22 node,
23 23 pathutil,
24 24 pycompat,
25 25 util,
26 26 )
27 27
28 28 from .revlogutils import sidedata as sidedatamod
29 29
30 30 from .utils import stringutil
31 31
32 32
33 33 def _filter(src, dst, t):
34 34 """filters out invalid copies after chaining"""
35 35
36 36 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
37 37 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
38 38 # in the following table (not including trivial cases). For example, case 2
39 39 # is where a file existed in 'src' and remained under that name in 'mid' and
40 40 # then was renamed between 'mid' and 'dst'.
41 41 #
42 42 # case src mid dst result
43 43 # 1 x y - -
44 44 # 2 x y y x->y
45 45 # 3 x y x -
46 46 # 4 x y z x->z
47 47 # 5 - x y -
48 48 # 6 x x y x->y
49 49 #
50 50 # _chain() takes care of chaining the copies in 'a' and 'b', but it
51 51 # cannot tell the difference between cases 1 and 2, between 3 and 4, or
52 52 # between 5 and 6, so it includes all cases in its result.
53 53 # Cases 1, 3, and 5 are then removed by _filter().
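#
# Illustrative sketch (hypothetical values): chaining a rename a->b
# (src->mid) with a rename b->c (mid->dst) can leave
# t == {b'c': b'a', b'b': b'a'}; since b'b' no longer exists in dst,
# the loop below drops that entry, leaving {b'c': b'a'} (case 4).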
54 54
55 55 for k, v in list(t.items()):
56 56 # remove copies from files that didn't exist
57 57 if v not in src:
58 58 del t[k]
59 59 # remove criss-crossed copies
60 60 elif k in src and v in dst:
61 61 del t[k]
62 62 # remove copies to files that were then removed
63 63 elif k not in dst:
64 64 del t[k]
65 65
66 66
67 67 def _chain(prefix, suffix):
68 68 """chain two sets of copies 'prefix' and 'suffix'"""
69 69 result = prefix.copy()
70 70 for key, value in pycompat.iteritems(suffix):
71 71 result[key] = prefix.get(value, value)
72 72 return result
73 73
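# A minimal sketch of _chain() in action (hypothetical values): if b'a'
# was renamed to b'b' in the prefix and b'b' to b'c' in the suffix, both
# final names chain back to the original source:
#
#   _chain({b'b': b'a'}, {b'c': b'b'}) == {b'b': b'a', b'c': b'a'}
#
# Stale entries such as b'b' are removed later by _filter().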
74 74
75 75 def _tracefile(fctx, am, basemf):
76 76 """return file context that is the ancestor of fctx present in ancestor
77 77 manifest am
78 78
79 79 Note: we used to try and stop after a given limit, however checking if that
80 80 limit was reached turned out to be very expensive, so we are better off
81 81 disabling that feature."""
82 82
83 83 for f in fctx.ancestors():
84 84 path = f.path()
85 85 if am.get(path, None) == f.filenode():
86 86 return path
87 87 if basemf and basemf.get(path, None) == f.filenode():
88 88 return path
89 89
90 90
91 91 def _dirstatecopies(repo, match=None):
92 92 ds = repo.dirstate
93 93 c = ds.copies().copy()
94 94 for k in list(c):
95 95 if ds[k] not in b'anm' or (match and not match(k)):
96 96 del c[k]
97 97 return c
98 98
99 99
100 100 def _computeforwardmissing(a, b, match=None):
101 101 """Computes which files are in b but not a.
102 102 This is its own function so extensions can easily wrap this call to see what
103 103 files _forwardcopies is about to process.
104 104 """
105 105 ma = a.manifest()
106 106 mb = b.manifest()
107 107 return mb.filesnotin(ma, match=match)
108 108
109 109
110 110 def usechangesetcentricalgo(repo):
111 111 """Checks if we should use changeset-centric copy algorithms"""
112 112 if repo.filecopiesmode == b'changeset-sidedata':
113 113 return True
114 114 readfrom = repo.ui.config(b'experimental', b'copies.read-from')
115 115 changesetsource = (b'changeset-only', b'compatibility')
116 116 return readfrom in changesetsource
117 117
118 118
119 119 def _committedforwardcopies(a, b, base, match):
120 120 """Like _forwardcopies(), but b.rev() cannot be None (working copy)"""
121 121 # files might have to be traced back to the fctx parent of the last
122 122 # one-side-only changeset, but not further back than that
123 123 repo = a._repo
124 124
125 125 if usechangesetcentricalgo(repo):
126 126 return _changesetforwardcopies(a, b, match)
127 127
128 128 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
129 129 dbg = repo.ui.debug
130 130 if debug:
131 131 dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b))
132 132 am = a.manifest()
133 133 basemf = None if base is None else base.manifest()
134 134
135 135 # find where new files came from
136 136 # we currently don't try to find where old files went, too expensive
137 137 # this means we can miss a case like 'hg rm b; hg cp a b'
138 138 cm = {}
139 139
140 140 # Computing the forward missing is quite expensive on large manifests, since
141 141 # it compares the entire manifests. We can optimize it in the common use
142 142 # case of computing what copies are in a commit versus its parent (like
143 143 # during a rebase or histedit). Note, we exclude merge commits from this
144 144 # optimization, since the ctx.files() for a merge commit is not correct for
145 145 # this comparison.
146 146 forwardmissingmatch = match
147 147 if b.p1() == a and b.p2().node() == node.nullid:
148 148 filesmatcher = matchmod.exact(b.files())
149 149 forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
150 150 missing = _computeforwardmissing(a, b, match=forwardmissingmatch)
151 151
152 152 ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
153 153
154 154 if debug:
155 155 dbg(b'debug.copies: missing files to search: %d\n' % len(missing))
156 156
157 157 for f in sorted(missing):
158 158 if debug:
159 159 dbg(b'debug.copies: tracing file: %s\n' % f)
160 160 fctx = b[f]
161 161 fctx._ancestrycontext = ancestrycontext
162 162
163 163 if debug:
164 164 start = util.timer()
165 165 opath = _tracefile(fctx, am, basemf)
166 166 if opath:
167 167 if debug:
168 168 dbg(b'debug.copies: rename of: %s\n' % opath)
169 169 cm[f] = opath
170 170 if debug:
171 171 dbg(
172 172 b'debug.copies: time: %f seconds\n'
173 173 % (util.timer() - start)
174 174 )
175 175 return cm
176 176
177 177
178 178 def _revinfogetter(repo):
179 179 """return a function that return multiple data given a <rev>"i
180 180
181 181 * p1: revision number of first parent
182 182 * p2: revision number of second parent
183 183 * p1copies: mapping of copies from p1
184 184 * p2copies: mapping of copies from p2
185 185 * removed: a list of removed files
186 186 """
187 187 cl = repo.changelog
188 188 parents = cl.parentrevs
189 189
190 190 if repo.filecopiesmode == b'changeset-sidedata':
191 191 changelogrevision = cl.changelogrevision
192 192 flags = cl.flags
193 193
194 194 # A small cache to avoid doing the work twice for merges
195 195 #
196 196 # In the vast majority of cases, if we ask for information about a
197 197 # revision's first parent, we'll later ask about the other. So it makes
198 198 # sense to keep the information around when reaching the first parent of
199 199 # a merge and to drop it after it has been provided for the second parent.
200 200 #
201 201 # There are cases where only one parent of the merge will be walked. It
202 202 # happens when the "destination" of the copy tracing is a descendant of a
203 203 # new root not common with the "source". In that case, we will only walk
204 204 # through merge parents that are descendants of changesets common
205 205 # between "source" and "destination".
206 206 #
207 207 # With the current implementation, if such changesets have copy
208 208 # information, we'll keep them in memory until the end of
209 209 # _changesetforwardcopies. We don't expect the case to be frequent
210 210 # enough to matter.
211 211 #
212 212 # In addition, it is possible to reach a pathological case where
213 213 # many first parents are met before any second parent is reached. In
214 214 # that case the cache could grow. If this ever becomes an issue, one can
215 215 # safely introduce a maximum cache size. This would trade extra CPU/IO
216 216 # time to save memory.
217 217 merge_caches = {}
218 218
219 219 def revinfo(rev):
220 220 p1, p2 = parents(rev)
221 221 if flags(rev) & REVIDX_SIDEDATA:
222 222 e = merge_caches.pop(rev, None)
223 223 if e is not None:
224 224 return e
225 225 c = changelogrevision(rev)
226 226 p1copies = c.p1copies
227 227 p2copies = c.p2copies
228 228 removed = c.filesremoved
229 229 if p1 != node.nullrev and p2 != node.nullrev:
230 230 # XXX in some cases we over-cache; knowingly ignored (see above)
231 231 merge_caches[rev] = (p1, p2, p1copies, p2copies, removed)
232 232 else:
233 233 p1copies = {}
234 234 p2copies = {}
235 235 removed = []
236 236 return p1, p2, p1copies, p2copies, removed
237 237
238 238 else:
239 239
240 240 def revinfo(rev):
241 241 p1, p2 = parents(rev)
242 242 ctx = repo[rev]
243 243 p1copies, p2copies = ctx._copies
244 244 removed = ctx.filesremoved()
245 245 return p1, p2, p1copies, p2copies, removed
246 246
247 247 return revinfo
248 248
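# Usage sketch (as consumed by _changesetforwardcopies() below):
#
#   revinfo = _revinfogetter(repo)
#   p1, p2, p1copies, p2copies, removed = revinfo(rev)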
249 249
250 250 def _changesetforwardcopies(a, b, match):
251 251 if a.rev() in (node.nullrev, b.rev()):
252 252 return {}
253 253
254 254 repo = a.repo().unfiltered()
255 255 children = {}
256 256 revinfo = _revinfogetter(repo)
257 257
258 258 cl = repo.changelog
259 259 missingrevs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
260 260 mrset = set(missingrevs)
261 261 roots = set()
262 262 for r in missingrevs:
263 263 for p in cl.parentrevs(r):
264 264 if p == node.nullrev:
265 265 continue
266 266 if p not in children:
267 267 children[p] = [r]
268 268 else:
269 269 children[p].append(r)
270 270 if p not in mrset:
271 271 roots.add(p)
272 272 if not roots:
273 273 # no common revision to track copies from
274 274 return {}
275 275 min_root = min(roots)
276 276
277 277 from_head = set(
278 278 cl.reachableroots(min_root, [b.rev()], list(roots), includepath=True)
279 279 )
280 280
281 281 iterrevs = set(from_head)
282 282 iterrevs &= mrset
283 283 iterrevs.update(roots)
284 284 iterrevs.remove(b.rev())
285 285 revs = sorted(iterrevs)
286 286 return _combinechangesetcopies(revs, children, b.rev(), revinfo, match)
287 287
288 288
289 289 def _combinechangesetcopies(revs, children, targetrev, revinfo, match):
290 290 """combine the copies information for each item of iterrevs
291 291
292 292 revs: sorted iterable of revision to visit
293 293 children: a {parent: [children]} mapping.
294 294 targetrev: the final copies destination revision (not in iterrevs)
295 295 revinfo(rev): a function that return (p1, p2, p1copies, p2copies, removed)
296 296 match: a matcher
297 297
298 298 It returns the aggregated copies information for `targetrev`.
299 299 """
300 300 all_copies = {}
301 301 alwaysmatch = match.always()
302 302 for r in revs:
303 303 copies = all_copies.pop(r, None)
304 304 if copies is None:
305 305 # this is a root
306 306 copies = {}
307 307 for i, c in enumerate(children[r]):
308 308 p1, p2, p1copies, p2copies, removed = revinfo(c)
309 309 if r == p1:
310 310 parent = 1
311 311 childcopies = p1copies
312 312 else:
313 313 assert r == p2
314 314 parent = 2
315 315 childcopies = p2copies
316 316 if not alwaysmatch:
317 317 childcopies = {
318 318 dst: src for dst, src in childcopies.items() if match(dst)
319 319 }
320 320 newcopies = copies
321 321 if childcopies:
322 322 newcopies = _chain(newcopies, childcopies)
323 323 # _chain makes a copy; we could avoid doing so in some
324 324 # simple/linear cases.
325 325 assert newcopies is not copies
326 326 for f in removed:
327 327 if f in newcopies:
328 328 if newcopies is copies:
329 329 # copy on write to avoid affecting potential other
330 330 # branches. when there are no other branches, this
331 331 # could be avoided.
332 332 newcopies = copies.copy()
333 333 del newcopies[f]
334 334 othercopies = all_copies.get(c)
335 335 if othercopies is None:
336 336 all_copies[c] = newcopies
337 337 else:
338 338 # we are the second parent to work on c, we need to merge our
339 339 # work with the other.
340 340 #
341 341 # Unlike when copies are stored in the filelog, we consider
342 342 # it a copy even if the destination already existed on the
343 343 # other branch. It's simply too expensive to check if the
344 344 # file existed in the manifest.
345 345 #
346 346 # In case of conflict, parent 1 takes precedence over parent 2.
347 347 # This is an arbitrary choice made anew when implementing
348 348 # changeset-based copies. It was made without regard to
349 349 # potential filelog-related behavior.
350 350 if parent == 1:
351 351 othercopies.update(newcopies)
352 352 else:
353 353 newcopies.update(othercopies)
354 354 all_copies[c] = newcopies
355 355 return all_copies[targetrev]
356 356
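# Worked micro-example for _combinechangesetcopies() (hypothetical
# revision numbers): with revs=[1, 2], children={1: [2], 2: [3]},
# targetrev=3, and revinfo() reporting p1copies={b'b': b'a'} for rev 2
# and {b'c': b'b'} for rev 3 (no removals), the copies chained through
# rev 2 reach rev 3 as {b'b': b'a', b'c': b'a'}.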
357 357
358 358 def _forwardcopies(a, b, base=None, match=None):
359 359 """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
360 360
361 361 if base is None:
362 362 base = a
363 363 match = a.repo().narrowmatch(match)
364 364 # check for working copy
365 365 if b.rev() is None:
366 366 cm = _committedforwardcopies(a, b.p1(), base, match)
367 367 # combine copies from dirstate if necessary
368 368 copies = _chain(cm, _dirstatecopies(b._repo, match))
369 369 else:
370 370 copies = _committedforwardcopies(a, b, base, match)
371 371 return copies
372 372
373 373
374 374 def _backwardrenames(a, b, match):
375 375 if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
376 376 return {}
377 377
378 378 # Even though we're not taking copies into account, 1:n rename situations
379 379 # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
380 380 # arbitrarily pick one of the renames.
381 381 # We don't want to pass in "match" here, since that would filter
382 382 # the destination by it. Since we're reversing the copies, we want
383 383 # to filter the source instead.
384 384 f = _forwardcopies(b, a)
385 385 r = {}
386 386 for k, v in sorted(pycompat.iteritems(f)):
387 387 if match and not match(v):
388 388 continue
389 389 # remove copies
390 390 if v in a:
391 391 continue
392 392 r[v] = k
393 393 return r
394 394
395 395
396 396 def pathcopies(x, y, match=None):
397 397 """find {dst@y: src@x} copy mapping for directed compare"""
398 398 repo = x._repo
399 399 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
400 400 if debug:
401 401 repo.ui.debug(
402 402 b'debug.copies: searching copies from %s to %s\n' % (x, y)
403 403 )
404 404 if x == y or not x or not y:
405 405 return {}
406 406 a = y.ancestor(x)
407 407 if a == x:
408 408 if debug:
409 409 repo.ui.debug(b'debug.copies: search mode: forward\n')
410 410 if y.rev() is None and x == y.p1():
411 411 # short-circuit to avoid issues with merge states
412 412 return _dirstatecopies(repo, match)
413 413 copies = _forwardcopies(x, y, match=match)
414 414 elif a == y:
415 415 if debug:
416 416 repo.ui.debug(b'debug.copies: search mode: backward\n')
417 417 copies = _backwardrenames(x, y, match=match)
418 418 else:
419 419 if debug:
420 420 repo.ui.debug(b'debug.copies: search mode: combined\n')
421 421 base = None
422 422 if a.rev() != node.nullrev:
423 423 base = x
424 424 copies = _chain(
425 425 _backwardrenames(x, a, match=match),
426 426 _forwardcopies(a, y, base, match=match),
427 427 )
428 428 _filter(x, y, copies)
429 429 return copies
430 430
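# Mode summary (a sketch of the three branches above): when x is an
# ancestor of y, copies are traced forward; when y is an ancestor of x,
# they are traced backward with the renames reversed; otherwise the trace
# runs backward from x to the common ancestor, then forward to y, and
# the chained result is cleaned up by _filter().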
431 431
432 432 def mergecopies(repo, c1, c2, base):
433 433 """
434 434 Finds moves and copies between context c1 and c2 that are relevant for
435 435 merging. 'base' will be used as the merge base.
436 436
437 437 Copytracing is used in commands like rebase, merge, unshelve, etc to merge
438 438 files that were moved/ copied in one merge parent and modified in another.
439 439 For example:
440 440
441 441 o ---> 4 another commit
442 442 |
443 443 | o ---> 3 commit that modifies a.txt
444 444 | /
445 445 o / ---> 2 commit that moves a.txt to b.txt
446 446 |/
447 447 o ---> 1 merge base
448 448
449 449 If we try to rebase revision 3 on revision 4, since there is no a.txt in
450 450 revision 4, and if the user has copytrace disabled, we print the following
451 451 message:
452 452
453 453 ```other changed <file> which local deleted```
454 454
455 455 Returns five dicts: "copy", "movewithdir", "diverge", "renamedelete" and
456 456 "dirmove".
457 457
458 458 "copy" is a mapping from destination name -> source name,
459 459 where source is in c1 and destination is in c2 or vice-versa.
460 460
461 461 "movewithdir" is a mapping from source name -> destination name,
462 462 where the file at source present in one context but not the other
463 463 needs to be moved to destination by the merge process, because the
464 464 other context moved the directory it is in.
465 465
466 466 "diverge" is a mapping of source name -> list of destination names
467 467 for divergent renames.
468 468
469 469 "renamedelete" is a mapping of source name -> list of destination
470 470 names for files deleted in c1 that were renamed in c2 or vice-versa.
471 471
472 472 "dirmove" is a mapping of detected source dir -> destination dir renames.
473 473 This is needed for handling changes to new files previously grafted into
474 474 renamed directories.
475 475
476 476 This function calls different copytracing algorithms based on config.
477 477 """
478 478 # avoid silly behavior for update from empty dir
479 479 if not c1 or not c2 or c1 == c2:
480 480 return {}, {}, {}, {}, {}
481 481
482 482 narrowmatch = c1.repo().narrowmatch()
483 483
484 484 # avoid silly behavior for parent -> working dir
485 485 if c2.node() is None and c1.node() == repo.dirstate.p1():
486 486 return _dirstatecopies(repo, narrowmatch), {}, {}, {}, {}
487 487
488 488 copytracing = repo.ui.config(b'experimental', b'copytrace')
489 489 if stringutil.parsebool(copytracing) is False:
490 490 # stringutil.parsebool() returns None when it is unable to parse the
491 491 # value, so in such cases we keep copytracing enabled
492 492 return {}, {}, {}, {}, {}
493 493
494 494 if usechangesetcentricalgo(repo):
495 495 # The heuristics don't make sense when we need changeset-centric algos
496 496 return _fullcopytracing(repo, c1, c2, base)
497 497
498 498 # Copy trace disabling is explicitly below the node == p1 logic above
499 499 # because the logic above is required for a simple copy to be kept across a
500 500 # rebase.
501 501 if copytracing == b'heuristics':
502 502 # Do full copytracing if only non-public revisions are involved as
503 503 # that will be fast enough and will also cover the copies which could
504 504 # be missed by heuristics
505 505 if _isfullcopytraceable(repo, c1, base):
506 506 return _fullcopytracing(repo, c1, c2, base)
507 507 return _heuristicscopytracing(repo, c1, c2, base)
508 508 else:
509 509 return _fullcopytracing(repo, c1, c2, base)
510 510
511 511
512 512 def _isfullcopytraceable(repo, c1, base):
513 513 """ Checks that if base, source and destination are all no-public branches,
514 514 if yes let's use the full copytrace algorithm for increased capabilities
515 515 since it will be fast enough.
516 516
517 517 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
518 518 number of changesets from c1 to base such that if number of changesets are
519 519 more than the limit, full copytracing algorithm won't be used.
520 520 """
521 521 if c1.rev() is None:
522 522 c1 = c1.p1()
523 523 if c1.mutable() and base.mutable():
524 524 sourcecommitlimit = repo.ui.configint(
525 525 b'experimental', b'copytrace.sourcecommitlimit'
526 526 )
527 527 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
528 528 return commits < sourcecommitlimit
529 529 return False
530 530
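# Example (using the documented default of 100 for
# experimental.copytrace.sourcecommitlimit): rebasing a draft stack of,
# say, 20 changesets onto a draft base stays under the limit, so the
# full copytrace algorithm is used instead of the heuristics.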
531 531
532 532 def _checksinglesidecopies(
533 533 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
534 534 ):
535 535 if src not in m2:
536 536 # deleted on side 2
537 537 if src not in m1:
538 538 # renamed on side 1, deleted on side 2
539 539 renamedelete[src] = dsts1
540 540 elif m2[src] != mb[src]:
541 541 if not _related(c2[src], base[src]):
542 542 return
543 543 # modified on side 2
544 544 for dst in dsts1:
545 545 if dst not in m2:
546 546 # dst not added on side 2 (handle as regular
547 547 # "both created" case in manifestmerge otherwise)
548 548 copy[dst] = src
549 549
550 550
551 551 def _fullcopytracing(repo, c1, c2, base):
552 552 """ The full copytracing algorithm which finds all the new files that were
553 553 added from merge base up to the top commit and for each file it checks if
554 554 this file was copied from another file.
555 555
556 556 This is pretty slow when a lot of changesets are involved but will track all
557 557 the copies.
558 558 """
559 559 m1 = c1.manifest()
560 560 m2 = c2.manifest()
561 561 mb = base.manifest()
562 562
563 563 copies1 = pathcopies(base, c1)
564 564 copies2 = pathcopies(base, c2)
565 565
566 566 inversecopies1 = {}
567 567 inversecopies2 = {}
568 568 for dst, src in copies1.items():
569 569 inversecopies1.setdefault(src, []).append(dst)
570 570 for dst, src in copies2.items():
571 571 inversecopies2.setdefault(src, []).append(dst)
572 572
573 573 copy = {}
574 574 diverge = {}
575 575 renamedelete = {}
576 576 allsources = set(inversecopies1) | set(inversecopies2)
577 577 for src in allsources:
578 578 dsts1 = inversecopies1.get(src)
579 579 dsts2 = inversecopies2.get(src)
580 580 if dsts1 and dsts2:
581 581 # copied/renamed on both sides
582 582 if src not in m1 and src not in m2:
583 583 # renamed on both sides
584 584 dsts1 = set(dsts1)
585 585 dsts2 = set(dsts2)
586 586 # If there's some overlap in the rename destinations, we
587 587 # consider it not divergent. For example, if side 1 copies 'a'
588 588 # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
589 589 # and 'd' and deletes 'a'.
590 590 if dsts1 & dsts2:
591 591 for dst in dsts1 & dsts2:
592 592 copy[dst] = src
593 593 else:
594 594 diverge[src] = sorted(dsts1 | dsts2)
595 595 elif src in m1 and src in m2:
596 596 # copied on both sides
597 597 dsts1 = set(dsts1)
598 598 dsts2 = set(dsts2)
599 599 for dst in dsts1 & dsts2:
600 600 copy[dst] = src
601 601 # TODO: Handle cases where it was renamed on one side and copied
602 602 # on the other side
603 603 elif dsts1:
604 604 # copied/renamed only on side 1
605 605 _checksinglesidecopies(
606 606 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
607 607 )
608 608 elif dsts2:
609 609 # copied/renamed only on side 2
610 610 _checksinglesidecopies(
611 611 src, dsts2, m2, m1, mb, c1, base, copy, renamedelete
612 612 )
613 613
614 614 renamedeleteset = set()
615 615 divergeset = set()
616 616 for dsts in diverge.values():
617 617 divergeset.update(dsts)
618 618 for dsts in renamedelete.values():
619 619 renamedeleteset.update(dsts)
620 620
621 621 # find interesting file sets from manifests
622 622 addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
623 623 addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
624 624 u1 = sorted(addedinm1 - addedinm2)
625 625 u2 = sorted(addedinm2 - addedinm1)
626 626
627 627 header = b" unmatched files in %s"
628 628 if u1:
629 629 repo.ui.debug(b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1)))
630 630 if u2:
631 631 repo.ui.debug(b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2)))
632 632
633 633 fullcopy = copies1.copy()
634 634 fullcopy.update(copies2)
635 635 if not fullcopy:
636 636 return copy, {}, diverge, renamedelete, {}
637 637
638 638 if repo.ui.debugflag:
639 639 repo.ui.debug(
640 640 b" all copies found (* = to merge, ! = divergent, "
641 641 b"% = renamed and deleted):\n"
642 642 )
643 643 for f in sorted(fullcopy):
644 644 note = b""
645 645 if f in copy:
646 646 note += b"*"
647 647 if f in divergeset:
648 648 note += b"!"
649 649 if f in renamedeleteset:
650 650 note += b"%"
651 651 repo.ui.debug(
652 652 b" src: '%s' -> dst: '%s' %s\n" % (fullcopy[f], f, note)
653 653 )
654 654 del divergeset
655 655
656 656 repo.ui.debug(b" checking for directory renames\n")
657 657
658 658 # generate a directory move map
659 659 d1, d2 = c1.dirs(), c2.dirs()
660 660 invalid = set()
661 661 dirmove = {}
662 662
663 663 # examine each file copy for a potential directory move, which is
664 664 # when all the files in a directory are moved to a new directory
665 665 for dst, src in pycompat.iteritems(fullcopy):
666 666 dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
667 667 if dsrc in invalid:
668 668 # already seen to be uninteresting
669 669 continue
670 670 elif dsrc in d1 and ddst in d1:
671 671 # directory wasn't entirely moved locally
672 672 invalid.add(dsrc)
673 673 elif dsrc in d2 and ddst in d2:
674 674 # directory wasn't entirely moved remotely
675 675 invalid.add(dsrc)
676 676 elif dsrc in dirmove and dirmove[dsrc] != ddst:
677 677 # files from the same directory moved to two different places
678 678 invalid.add(dsrc)
679 679 else:
680 680 # looks good so far
681 681 dirmove[dsrc] = ddst
682 682
683 683 for i in invalid:
684 684 if i in dirmove:
685 685 del dirmove[i]
686 686 del d1, d2, invalid
687 687
688 688 if not dirmove:
689 689 return copy, {}, diverge, renamedelete, {}
690 690
691 691 dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}
692 692
693 693 for d in dirmove:
694 694 repo.ui.debug(
695 695 b" discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
696 696 )
697 697
698 698 movewithdir = {}
699 699 # check unaccounted nonoverlapping files against directory moves
700 700 for f in u1 + u2:
701 701 if f not in fullcopy:
702 702 for d in dirmove:
703 703 if f.startswith(d):
704 704 # new file added in a directory that was moved, move it
705 705 df = dirmove[d] + f[len(d) :]
706 706 if df not in copy:
707 707 movewithdir[f] = df
708 708 repo.ui.debug(
709 709 b" pending file src: '%s' -> dst: '%s'\n"
710 710 % (f, df)
711 711 )
712 712 break
713 713
714 714 return copy, movewithdir, diverge, renamedelete, dirmove
715 715
716 716
717 717 def _heuristicscopytracing(repo, c1, c2, base):
718 718 """ Fast copytracing using filename heuristics
719 719
720 720 Assumes that moves or renames are of one of the following two types:
721 721
722 722 1) Inside a directory only (same directory name but different filenames)
723 723 2) Move from one directory to another
724 724 (same filenames but different directory names)
725 725
726 726 Works only when there are no merge commits in the "source branch".
727 727 The source branch consists of the commits from base up to c2, not including base.
728 728 
729 729 If a merge is involved, it falls back to _fullcopytracing().
730 730
731 731 Can be used by setting the following config:
732 732
733 733 [experimental]
734 734 copytrace = heuristics
735 735
736 736 In some cases the copy/move candidates found by the heuristics can be very
737 737 numerous, which will make the algorithm slow. The number of possible
738 738 candidates to check can be limited by using the config
739 739 `experimental.copytrace.movecandidateslimit`, which defaults to 100.
740 740 """
741 741
742 742 if c1.rev() is None:
743 743 c1 = c1.p1()
744 744 if c2.rev() is None:
745 745 c2 = c2.p1()
746 746
747 747 copies = {}
748 748
749 749 changedfiles = set()
750 750 m1 = c1.manifest()
751 751 if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
752 752 # If base is not in c2 branch, we switch to fullcopytracing
753 753 repo.ui.debug(
754 754 b"switching to full copytracing as base is not "
755 755 b"an ancestor of c2\n"
756 756 )
757 757 return _fullcopytracing(repo, c1, c2, base)
758 758
759 759 ctx = c2
760 760 while ctx != base:
761 761 if len(ctx.parents()) == 2:
762 762 # To keep things simple let's not handle merges
763 763 repo.ui.debug(b"switching to full copytracing because of merges\n")
764 764 return _fullcopytracing(repo, c1, c2, base)
765 765 changedfiles.update(ctx.files())
766 766 ctx = ctx.p1()
767 767
768 768 cp = _forwardcopies(base, c2)
769 769 for dst, src in pycompat.iteritems(cp):
770 770 if src in m1:
771 771 copies[dst] = src
772 772
773 773 # file is missing if it isn't present in the destination, but is present in
774 774 # the base and present in the source.
775 775 # Presence in the base is important to exclude added files, presence in the
776 776 # source is important to exclude removed files.
777 777 filt = lambda f: f not in m1 and f in base and f in c2
778 778 missingfiles = [f for f in changedfiles if filt(f)]
779 779
780 780 if missingfiles:
781 781 basenametofilename = collections.defaultdict(list)
782 782 dirnametofilename = collections.defaultdict(list)
783 783
784 784 for f in m1.filesnotin(base.manifest()):
785 785 basename = os.path.basename(f)
786 786 dirname = os.path.dirname(f)
787 787 basenametofilename[basename].append(f)
788 788 dirnametofilename[dirname].append(f)
789 789
790 790 for f in missingfiles:
791 791 basename = os.path.basename(f)
792 792 dirname = os.path.dirname(f)
793 793 samebasename = basenametofilename[basename]
794 794 samedirname = dirnametofilename[dirname]
795 795 movecandidates = samebasename + samedirname
796 796 # f is guaranteed to be present in c2, that's why
797 797 # c2.filectx(f) won't fail
798 798 f2 = c2.filectx(f)
799 799 # we can have a lot of candidates, which can slow down the heuristics;
800 800 # use a config value to limit the number of candidate moves to check
801 801 maxcandidates = repo.ui.configint(
802 802 b'experimental', b'copytrace.movecandidateslimit'
803 803 )
804 804
805 805 if len(movecandidates) > maxcandidates:
806 806 repo.ui.status(
807 807 _(
808 808 b"skipping copytracing for '%s', more "
809 809 b"candidates than the limit: %d\n"
810 810 )
811 811 % (f, len(movecandidates))
812 812 )
813 813 continue
814 814
815 815 for candidate in movecandidates:
816 816 f1 = c1.filectx(candidate)
817 817 if _related(f1, f2):
818 818 # if there are a few related copies then we'll merge
819 819 # changes into all of them. This matches the behaviour
820 820 # of upstream copytracing
821 821 copies[candidate] = f
822 822
823 823 return copies, {}, {}, {}, {}
824 824
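# Matching sketch: a missing file like b'dir/name.txt' is compared against
# the new files in c1 that share its basename (b'name.txt' in any
# directory) or its dirname (anything under b'dir/'); each candidate whose
# history is related to c2's version of the missing file (per _related()
# below) is recorded as having been copied from it.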
825 825
826 826 def _related(f1, f2):
827 827 """return True if f1 and f2 filectx have a common ancestor
828 828
829 829 Walk back to common ancestor to see if the two files originate
830 830 from the same file. Since workingfilectx's rev() is None it messes
831 831 up the integer comparison logic, hence the pre-step check for
832 832 None (f1 and f2 can only be workingfilectx's initially).
833 833 """
834 834
835 835 if f1 == f2:
836 836 return True # a match
837 837
838 838 g1, g2 = f1.ancestors(), f2.ancestors()
839 839 try:
840 840 f1r, f2r = f1.linkrev(), f2.linkrev()
841 841
842 842 if f1r is None:
843 843 f1 = next(g1)
844 844 if f2r is None:
845 845 f2 = next(g2)
846 846
847 847 while True:
848 848 f1r, f2r = f1.linkrev(), f2.linkrev()
849 849 if f1r > f2r:
850 850 f1 = next(g1)
851 851 elif f2r > f1r:
852 852 f2 = next(g2)
853 853 else: # f1 and f2 point to files in the same linkrev
854 854 return f1 == f2 # true if they point to the same file
855 855 except StopIteration:
856 856 return False
857 857
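# Sketch of the walk: both generators yield ancestors with decreasing
# linkrevs; whichever file has the higher linkrev is stepped back until
# the two sides land on the same linkrev, where they either point to the
# same file revision (related) or one walk runs off the end
# (StopIteration, unrelated).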
858 858
859 859 def graftcopies(wctx, ctx, base):
860 """reproduce copies between base and ctx in the wctx"""
860 """reproduce copies between base and ctx in the wctx
861
862 Unlike mergecopies(), this function will only consider copies between base
863 and ctx; it will ignore copies between base and wctx. Also unlike
864 mergecopies(), this function will apply copies to the working copy (instead
865 of just returning information about the copies). That makes it cheaper
866 (especially in the common case of base==ctx.p1()) and useful also when
867 experimental.copytrace=off.
868
869 merge.update() will have already marked most copies, but it will only
870 mark copies if it thinks the source files are related (see
871 merge._related()). It will also not mark copies if the file wasn't modified
872 on the local side. This function adds the copies that were "missed"
873 by merge.update().
874 """
861 875 new_copies = pathcopies(base, ctx)
862 876 _filter(wctx.p1(), wctx, new_copies)
863 877 for dst, src in pycompat.iteritems(new_copies):
864 878 wctx[dst].markcopied(src)
865 879
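# A hedged usage sketch: if ctx renamed b'a' to b'b' relative to base,
# pathcopies(base, ctx) yields {b'b': b'a'}; after _filter() validates
# the entries against the working copy parent, markcopied() records the
# relationship in the working context so a graft/rebase keeps the copy
# even with experimental.copytrace=off.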
866 880
867 881 def computechangesetfilesadded(ctx):
868 882 """return the list of files added in a changeset
869 883 """
870 884 added = []
871 885 for f in ctx.files():
872 886 if not any(f in p for p in ctx.parents()):
873 887 added.append(f)
874 888 return added
875 889
876 890
877 891 def computechangesetfilesremoved(ctx):
878 892 """return the list of files removed in a changeset
879 893 """
880 894 removed = []
881 895 for f in ctx.files():
882 896 if f not in ctx:
883 897 removed.append(f)
884 898 return removed
885 899
886 900
887 901 def computechangesetcopies(ctx):
888 902 """return the copies data for a changeset
889 903
890 904 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
891 905 
892 906 Each dictionary is of the form: `{newname: oldname}`
893 907 """
894 908 p1copies = {}
895 909 p2copies = {}
896 910 p1 = ctx.p1()
897 911 p2 = ctx.p2()
898 912 narrowmatch = ctx._repo.narrowmatch()
899 913 for dst in ctx.files():
900 914 if not narrowmatch(dst) or dst not in ctx:
901 915 continue
902 916 copied = ctx[dst].renamed()
903 917 if not copied:
904 918 continue
905 919 src, srcnode = copied
906 920 if src in p1 and p1[src].filenode() == srcnode:
907 921 p1copies[dst] = src
908 922 elif src in p2 and p2[src].filenode() == srcnode:
909 923 p2copies[dst] = src
910 924 return p1copies, p2copies
911 925
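# Shape sketch (hypothetical data): for a changeset where b'b' was
# copied from b'a' relative to p1 and b'd' from b'c' relative to p2,
# this returns ({b'b': b'a'}, {b'd': b'c'}).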
912 926
913 927 def encodecopies(files, copies):
914 928 items = []
915 929 for i, dst in enumerate(files):
916 930 if dst in copies:
917 931 items.append(b'%d\0%s' % (i, copies[dst]))
918 932 if len(items) != len(copies):
919 933 raise error.ProgrammingError(
920 934 b'some copy targets missing from file list'
921 935 )
922 936 return b"\n".join(items)
923 937
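# Encoding sketch (hypothetical values): with files == [b'a', b'b'] and
# copies == {b'b': b'a'}, each entry becomes b'<index>\x00<source>', so
#
#   encodecopies([b'a', b'b'], {b'b': b'a'}) == b'1\x00a'
#
# decodecopies() below reverses the mapping using the same file list.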
924 938
925 939 def decodecopies(files, data):
926 940 try:
927 941 copies = {}
928 942 if not data:
929 943 return copies
930 944 for l in data.split(b'\n'):
931 945 strindex, src = l.split(b'\0')
932 946 i = int(strindex)
933 947 dst = files[i]
934 948 copies[dst] = src
935 949 return copies
936 950 except (ValueError, IndexError):
937 951 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
938 952 # used different syntax for the value.
939 953 return None
940 954
941 955
942 956 def encodefileindices(files, subset):
943 957 subset = set(subset)
944 958 indices = []
945 959 for i, f in enumerate(files):
946 960 if f in subset:
947 961 indices.append(b'%d' % i)
948 962 return b'\n'.join(indices)
949 963
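# Encoding sketch (hypothetical values): the subset is stored as
# newline-separated indices into `files`, e.g.
#
#   encodefileindices([b'a', b'b', b'c'], [b'a', b'c']) == b'0\n2'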
950 964
951 965 def decodefileindices(files, data):
952 966 try:
953 967 subset = []
954 968 if not data:
955 969 return subset
956 970 for strindex in data.split(b'\n'):
957 971 i = int(strindex)
958 972 if i < 0 or i >= len(files):
959 973 return None
960 974 subset.append(files[i])
961 975 return subset
962 976 except (ValueError, IndexError):
963 977 # Perhaps someone had chosen the same key name (e.g. "added") and
964 978 # used different syntax for the value.
965 979 return None
966 980
967 981
968 982 def _getsidedata(srcrepo, rev):
969 983 ctx = srcrepo[rev]
970 984 filescopies = computechangesetcopies(ctx)
971 985 filesadded = computechangesetfilesadded(ctx)
972 986 filesremoved = computechangesetfilesremoved(ctx)
973 987 sidedata = {}
974 988 if any([filescopies, filesadded, filesremoved]):
975 989 sortedfiles = sorted(ctx.files())
976 990 p1copies, p2copies = filescopies
977 991 p1copies = encodecopies(sortedfiles, p1copies)
978 992 p2copies = encodecopies(sortedfiles, p2copies)
979 993 filesadded = encodefileindices(sortedfiles, filesadded)
980 994 filesremoved = encodefileindices(sortedfiles, filesremoved)
981 995 if p1copies:
982 996 sidedata[sidedatamod.SD_P1COPIES] = p1copies
983 997 if p2copies:
984 998 sidedata[sidedatamod.SD_P2COPIES] = p2copies
985 999 if filesadded:
986 1000 sidedata[sidedatamod.SD_FILESADDED] = filesadded
987 1001 if filesremoved:
988 1002 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
989 1003 return sidedata
990 1004
991 1005
992 1006 def getsidedataadder(srcrepo, destrepo):
993 1007 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
994 1008 if pycompat.iswindows or not use_w:
995 1009 return _get_simple_sidedata_adder(srcrepo, destrepo)
996 1010 else:
997 1011 return _get_worker_sidedata_adder(srcrepo, destrepo)
998 1012
999 1013
1000 1014 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
1001 1015 """The function used by worker precomputing sidedata
1002 1016
1003 1017 It read an input queue containing revision numbers
1004 1018 It write in an output queue containing (rev, <sidedata-map>)
1005 1019
1006 1020 The `None` input value is used as a stop signal.
1007 1021
1008 1022 The `tokens` semaphore is user to avoid having too many unprocessed
1009 1023 entries. The workers needs to acquire one token before fetching a task.
1010 1024 They will be released by the consumer of the produced data.
1011 1025 """
1012 1026 tokens.acquire()
1013 1027 rev = revs_queue.get()
1014 1028 while rev is not None:
1015 1029 data = _getsidedata(srcrepo, rev)
1016 1030 sidedata_queue.put((rev, data))
1017 1031 tokens.acquire()
1018 1032 rev = revs_queue.get()
1019 1033 # processing of `None` is completed, release the token.
1020 1034 tokens.release()
1021 1035
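# Pipeline sketch: the parent process enqueues every revision number
# followed by one None marker per worker; each worker loops
# revs_queue.get() -> _getsidedata() -> sidedata_queue.put((rev, data))
# until it sees None, with the `tokens` semaphore providing backpressure
# so at most nbworkers * BUFF_PER_WORKER results wait unconsumed.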
1022 1036
1023 1037 BUFF_PER_WORKER = 50
1024 1038
1025 1039
1026 1040 def _get_worker_sidedata_adder(srcrepo, destrepo):
1027 1041 """The parallel version of the sidedata computation
1028 1042
1029 1043 This code spawns a pool of workers that precompute a buffer of sidedata
1030 1044 before we actually need it"""
1031 1045 # avoid circular import copies -> scmutil -> worker -> copies
1032 1046 from . import worker
1033 1047
1034 1048 nbworkers = worker._numworkers(srcrepo.ui)
1035 1049
1036 1050 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
1037 1051 revsq = multiprocessing.Queue()
1038 1052 sidedataq = multiprocessing.Queue()
1039 1053
1040 1054 assert srcrepo.filtername is None
1041 1055 # queue all tasks beforehand; revision numbers are small and it makes
1042 1056 # synchronisation simpler
1043 1057 #
1044 1058 # Since the computation for each node can be quite expensive, the overhead
1045 1059 # of using a single queue is not relevant. In practice, most computations
1046 1060 # are fast but some are very expensive and dominate all the other smaller
1047 1061 # costs.
1048 1062 for r in srcrepo.changelog.revs():
1049 1063 revsq.put(r)
1050 1064 # queue the "no more tasks" markers
1051 1065 for i in range(nbworkers):
1052 1066 revsq.put(None)
1053 1067
1054 1068 allworkers = []
1055 1069 for i in range(nbworkers):
1056 1070 args = (srcrepo, revsq, sidedataq, tokens)
1057 1071 w = multiprocessing.Process(target=_sidedata_worker, args=args)
1058 1072 allworkers.append(w)
1059 1073 w.start()
1060 1074
1061 1075 # dictionary to store results for revisions higher than the one we are
1062 1076 # looking for. For example, if we need the sidedata map for 42 and 43 is
1063 1077 # received, we shelve 43 for later use.
1064 1078 staging = {}
1065 1079
1066 1080 def sidedata_companion(revlog, rev):
1067 1081 sidedata = {}
1068 1082 if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog
1069 1083 # Was the data previously shelved?
1070 1084 sidedata = staging.pop(rev, None)
1071 1085 if sidedata is None:
1072 1086 # look at the queued results until we find the one we are looking
1073 1087 # for (shelving the other ones)
1074 1088 r, sidedata = sidedataq.get()
1075 1089 while r != rev:
1076 1090 staging[r] = sidedata
1077 1091 r, sidedata = sidedataq.get()
1078 1092 tokens.release()
1079 1093 return False, (), sidedata
1080 1094
1081 1095 return sidedata_companion
1082 1096
1083 1097
1084 1098 def _get_simple_sidedata_adder(srcrepo, destrepo):
1085 1099 """The simple version of the sidedata computation
1086 1100
1087 1101 It just computes it in the same thread, on request"""
1088 1102
1089 1103 def sidedatacompanion(revlog, rev):
1090 1104 sidedata = {}
1091 1105 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
1092 1106 sidedata = _getsidedata(srcrepo, rev)
1093 1107 return False, (), sidedata
1094 1108
1095 1109 return sidedatacompanion
1096 1110
1097 1111
1098 1112 def getsidedataremover(srcrepo, destrepo):
1099 1113 def sidedatacompanion(revlog, rev):
1100 1114 f = ()
1101 1115 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
1102 1116 if revlog.flags(rev) & REVIDX_SIDEDATA:
1103 1117 f = (
1104 1118 sidedatamod.SD_P1COPIES,
1105 1119 sidedatamod.SD_P2COPIES,
1106 1120 sidedatamod.SD_FILESADDED,
1107 1121 sidedatamod.SD_FILESREMOVED,
1108 1122 )
1109 1123 return False, f, {}
1110 1124
1111 1125 return sidedatacompanion