changing-files: add clean computation of changed files for roots...
marmoute -
r46258:f6811e5b default
@@ -1,650 +1,665 b''
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11 import struct
12 12
13 13 from . import (
14 14 error,
15 15 node,
16 16 pycompat,
17 17 util,
18 18 )
19 19
20 20 from .revlogutils import (
21 21 flagutil as sidedataflag,
22 22 sidedata as sidedatamod,
23 23 )
24 24
25 25
26 26 class ChangingFiles(object):
27 27 """A class recording the changes made to files by a changeset
28 28
29 29 Actions performed on files are gathered into 5 sets:
30 30
31 31 - added: files actively added in the changeset.
32 32 - merged: files whose history got merged
33 33 - removed: files removed in the revision
34 34 - salvaged: files that might have been deleted by a merge but were not
35 35 - touched: files affected by the merge
36 36
37 37 and copy information is held by 2 mappings
38 38
39 39 - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
40 40 - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies
41 41
42 42 See their inline help for details.
43 43 """
44 44
45 45 def __init__(
46 46 self,
47 47 touched=None,
48 48 added=None,
49 49 removed=None,
50 50 merged=None,
51 51 salvaged=None,
52 52 p1_copies=None,
53 53 p2_copies=None,
54 54 ):
55 55 self._added = set(() if added is None else added)
56 56 self._merged = set(() if merged is None else merged)
57 57 self._removed = set(() if removed is None else removed)
58 58 self._touched = set(() if touched is None else touched)
59 59 self._salvaged = set(() if salvaged is None else salvaged)
60 60 self._touched.update(self._added)
61 61 self._touched.update(self._merged)
62 62 self._touched.update(self._removed)
63 63 self._p1_copies = dict(() if p1_copies is None else p1_copies)
64 64 self._p2_copies = dict(() if p2_copies is None else p2_copies)
65 65
66 66 def __eq__(self, other):
67 67 return (
68 68 self.added == other.added
69 69 and self.merged == other.merged
70 70 and self.removed == other.removed
71 71 and self.salvaged == other.salvaged
72 72 and self.touched == other.touched
73 73 and self.copied_from_p1 == other.copied_from_p1
74 74 and self.copied_from_p2 == other.copied_from_p2
75 75 )
76 76
77 77 @util.propertycache
78 78 def added(self):
79 79 """files actively added in the changeset
80 80
81 81 Any file present in that revision that was absent in all the changeset's
82 82 parents.
83 83
84 84 In case of merge, this means a file absent in one of the parents but
85 85 existing in the other will *not* be contained in this set. (It was
86 86 added by an ancestor.)
87 87 """
88 88 return frozenset(self._added)
89 89
90 90 def mark_added(self, filename):
91 91 if 'added' in vars(self):
92 92 del self.added
93 93 self._added.add(filename)
94 94 self.mark_touched(filename)
95 95
96 96 def update_added(self, filenames):
97 97 for f in filenames:
98 98 self.mark_added(f)
99 99
100 100 @util.propertycache
101 101 def merged(self):
102 102 """files actively merged during a merge
103 103
104 104 Any modified file which had modifications on both sides that needed merging.
105 105
106 106 In this case a new filenode was created and it has two parents.
107 107 """
108 108 return frozenset(self._merged)
109 109
110 110 def mark_merged(self, filename):
111 111 if 'merged' in vars(self):
112 112 del self.merged
113 113 self._merged.add(filename)
114 114 self.mark_touched(filename)
115 115
116 116 def update_merged(self, filenames):
117 117 for f in filenames:
118 118 self.mark_merged(f)
119 119
120 120 @util.propertycache
121 121 def removed(self):
122 122 """files actively removed by the changeset
123 123
124 124 In case of merge, this will only contain files whose removal discards "new"
125 125 content. For any file absent in the current changeset:
126 126
127 127 a) If the file exists in both parents, it is clearly "actively" removed
128 128 by this changeset.
129 129
130 130 b) If a file exists in only one parent and in none of the common
131 131 ancestors, then the file was newly added in one of the merged branches
132 132 and then got "actively" removed.
133 133
134 134 c) If a file exists in only one parent and at least one of the common
135 135 ancestors using the same filenode, then the file was unchanged on one
136 136 side and deleted on the other side. The merge "passively" propagated
137 137 that deletion, but didn't "actively" remove the file. In this case the
138 138 file is *not* included in the `removed` set.
139 139
140 140 d) If a file exists in only one parent and at least one of the common
141 141 ancestors using a different filenode, then the file was changed on one
142 142 side and removed on the other side. The merge process "actively"
143 143 decided to drop the new change and delete the file. Unlike in the
144 144 previous case (c), the file is included in the `removed` set.
145 145
146 146 Summary table for merge:
147 147
148 148 case | exists in parents | exists in gca || removed
149 149 (a) | both | * || yes
150 150 (b) | one | none || yes
151 151 (c) | one | same filenode || no
152 152 (d) | one | new filenode || yes
153 153 """
154 154 return frozenset(self._removed)
155 155
156 156 def mark_removed(self, filename):
157 157 if 'removed' in vars(self):
158 158 del self.removed
159 159 self._removed.add(filename)
160 160 self.mark_touched(filename)
161 161
162 162 def update_removed(self, filenames):
163 163 for f in filenames:
164 164 self.mark_removed(f)
165 165
166 166 @util.propertycache
167 167 def salvaged(self):
168 168 """files that might have been deleted by a merge, but still exists.
169 169
170 170 During a merge, the manifest merging might select some files for
171 171 removal, or for a removed/changed conflict. If at commit time the file
172 172 still exists, its removal was "reverted" and the file is "salvaged".
173 173 """
174 174 return frozenset(self._salvaged)
175 175
176 176 def mark_salvaged(self, filename):
177 177 if "salvaged" in vars(self):
178 178 del self.salvaged
179 179 self._salvaged.add(filename)
180 180 self.mark_touched(filename)
181 181
182 182 def update_salvaged(self, filenames):
183 183 for f in filenames:
184 184 self.mark_salvaged(f)
185 185
186 186 @util.propertycache
187 187 def touched(self):
188 188 """files either actively modified, added or removed"""
189 189 return frozenset(self._touched)
190 190
191 191 def mark_touched(self, filename):
192 192 if 'touched' in vars(self):
193 193 del self.touched
194 194 self._touched.add(filename)
195 195
196 196 def update_touched(self, filenames):
197 197 for f in filenames:
198 198 self.mark_touched(f)
199 199
200 200 @util.propertycache
201 201 def copied_from_p1(self):
202 202 return self._p1_copies.copy()
203 203
204 204 def mark_copied_from_p1(self, source, dest):
205 205 if 'copied_from_p1' in vars(self):
206 206 del self.copied_from_p1
207 207 self._p1_copies[dest] = source
208 208
209 209 def update_copies_from_p1(self, copies):
210 210 for dest, source in copies.items():
211 211 self.mark_copied_from_p1(source, dest)
212 212
213 213 @util.propertycache
214 214 def copied_from_p2(self):
215 215 return self._p2_copies.copy()
216 216
217 217 def mark_copied_from_p2(self, source, dest):
218 218 if 'copied_from_p2' in vars(self):
219 219 del self.copied_from_p2
220 220 self._p2_copies[dest] = source
221 221
222 222 def update_copies_from_p2(self, copies):
223 223 for dest, source in copies.items():
224 224 self.mark_copied_from_p2(source, dest)
225 225
226 226
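
For orientation, here is a minimal usage sketch (an illustration only, not part of the changeset) of how a ChangingFiles instance is populated and queried. The mark_* methods update both the specific set and the `touched` superset and invalidate the cached properties, which return frozen snapshots:

files = ChangingFiles()
files.mark_added(b'new-file.txt')
files.mark_removed(b'old-file.txt')
files.mark_copied_from_p1(b'source.txt', b'new-file.txt')

assert b'new-file.txt' in files.added
assert b'new-file.txt' in files.touched  # mark_added also marks "touched"
assert files.copied_from_p1 == {b'new-file.txt': b'source.txt'}
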
227 227 def compute_all_files_changes(ctx):
228 228 """compute the files changed by a revision"""
229 p1 = ctx.p1()
230 p2 = ctx.p2()
231 if p1.rev() == node.nullrev and p2.rev() == node.nullrev:
232 return _process_root(ctx)
229 233 filescopies = computechangesetcopies(ctx)
230 234 filesadded = computechangesetfilesadded(ctx)
231 235 filesremoved = computechangesetfilesremoved(ctx)
232 236 filesmerged = computechangesetfilesmerged(ctx)
233 237 files = ChangingFiles()
234 238 files.update_touched(ctx.files())
235 239 files.update_added(filesadded)
236 240 files.update_removed(filesremoved)
237 241 files.update_merged(filesmerged)
238 242 files.update_copies_from_p1(filescopies[0])
239 243 files.update_copies_from_p2(filescopies[1])
240 244 return files
241 245
242 246
247 def _process_root(ctx):
248 """compute the appropriate changed files for a changeset with no parents
249 """
250 # Simple, there was nothing before it, so everything is added.
251 md = ChangingFiles()
252 manifest = ctx.manifest()
253 for filename in manifest:
254 md.mark_added(filename)
255 return md
256
257
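
The new _process_root fast path above simply marks every file in the root manifest as added. A toy illustration (the list below is a hypothetical stand-in for ctx.manifest(); iterating a real manifest also yields filenames):

# hypothetical stand-in for ctx.manifest()
root_manifest = [b'README', b'setup.py', b'mercurial/metadata.py']

md = ChangingFiles()
for filename in root_manifest:
    md.mark_added(filename)

assert md.added == frozenset(root_manifest)
assert md.touched == md.added  # nothing else is marked for a root
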
243 258 def computechangesetfilesadded(ctx):
244 259 """return the list of files added in a changeset
245 260 """
246 261 added = []
247 262 for f in ctx.files():
248 263 if not any(f in p for p in ctx.parents()):
249 264 added.append(f)
250 265 return added
251 266
252 267
253 268 def get_removal_filter(ctx, x=None):
254 269 """return a function to detect files "wrongly" detected as `removed`
255 270
256 271 When a file is removed relative to p1 in a merge, this
257 272 function determines whether the absence is due to a
258 273 deletion from a parent, or whether the merge commit
259 274 itself deletes the file. We decide this by doing a
260 275 simplified three way merge of the manifest entry for
261 276 the file. There are two ways we decide the merge
262 277 itself didn't delete a file:
263 278 - neither parent (nor the merge) contain the file
264 279 - exactly one parent contains the file, and that
265 280 parent has the same filelog entry as the merge
266 281 ancestor (or all of them if there are two). In other
267 282 words, that parent left the file unchanged while the
268 283 other one deleted it.
269 284 One way to think about this is that deleting a file is
270 285 similar to emptying it, so the list of changed files
271 286 should be similar either way. The computation
272 287 described above is not done directly in _filecommit
273 288 when creating the list of changed files, however
274 289 it does something very similar by comparing filelog
275 290 nodes.
276 291 """
277 292
278 293 if x is not None:
279 294 p1, p2, m1, m2 = x
280 295 else:
281 296 p1 = ctx.p1()
282 297 p2 = ctx.p2()
283 298 m1 = p1.manifest()
284 299 m2 = p2.manifest()
285 300
286 301 @util.cachefunc
287 302 def mas():
288 303 p1n = p1.node()
289 304 p2n = p2.node()
290 305 cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
291 306 if not cahs:
292 307 cahs = [node.nullrev]
293 308 return [ctx.repo()[r].manifest() for r in cahs]
294 309
295 310 def deletionfromparent(f):
296 311 if f in m1:
297 312 return f not in m2 and all(
298 313 f in ma and ma.find(f) == m1.find(f) for ma in mas()
299 314 )
300 315 elif f in m2:
301 316 return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
302 317 else:
303 318 return True
304 319
305 320 return deletionfromparent
306 321
307 322
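
A short sketch of the intended call pattern for this filter (it mirrors computechangesetfilesremoved just below; `ctx` is assumed to be a merge changectx obtained elsewhere). The predicate returns True when a file's absence was merely inherited from a parent, notably case (c) of the summary table above, so only actively removed files survive:

candidates = [f for f in ctx.files() if f not in ctx]
deletionfromparent = get_removal_filter(ctx)
actively_removed = [f for f in candidates if not deletionfromparent(f)]
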
308 323 def computechangesetfilesremoved(ctx):
309 324 """return the list of files removed in a changeset
310 325 """
311 326 removed = []
312 327 for f in ctx.files():
313 328 if f not in ctx:
314 329 removed.append(f)
315 330 if removed:
316 331 rf = get_removal_filter(ctx)
317 332 removed = [r for r in removed if not rf(r)]
318 333 return removed
319 334
320 335
321 336 def computechangesetfilesmerged(ctx):
322 337 """return the list of files merged in a changeset
323 338 """
324 339 merged = []
325 340 if len(ctx.parents()) < 2:
326 341 return merged
327 342 for f in ctx.files():
328 343 if f in ctx:
329 344 fctx = ctx[f]
330 345 parents = fctx._filelog.parents(fctx._filenode)
331 346 if parents[1] != node.nullid:
332 347 merged.append(f)
333 348 return merged
334 349
335 350
336 351 def computechangesetcopies(ctx):
337 352 """return the copies data for a changeset
338 353
339 354 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
340 355
341 356 Each dictionary is in the form: `{newname: oldname}`
342 357 """
343 358 p1copies = {}
344 359 p2copies = {}
345 360 p1 = ctx.p1()
346 361 p2 = ctx.p2()
347 362 narrowmatch = ctx._repo.narrowmatch()
348 363 for dst in ctx.files():
349 364 if not narrowmatch(dst) or dst not in ctx:
350 365 continue
351 366 copied = ctx[dst].renamed()
352 367 if not copied:
353 368 continue
354 369 src, srcnode = copied
355 370 if src in p1 and p1[src].filenode() == srcnode:
356 371 p1copies[dst] = src
357 372 elif src in p2 and p2[src].filenode() == srcnode:
358 373 p2copies[dst] = src
359 374 return p1copies, p2copies
360 375
361 376
362 377 def encodecopies(files, copies):
363 378 items = []
364 379 for i, dst in enumerate(files):
365 380 if dst in copies:
366 381 items.append(b'%d\0%s' % (i, copies[dst]))
367 382 if len(items) != len(copies):
368 383 raise error.ProgrammingError(
369 384 b'some copy targets missing from file list'
370 385 )
371 386 return b"\n".join(items)
372 387
373 388
374 389 def decodecopies(files, data):
375 390 try:
376 391 copies = {}
377 392 if not data:
378 393 return copies
379 394 for l in data.split(b'\n'):
380 395 strindex, src = l.split(b'\0')
381 396 i = int(strindex)
382 397 dst = files[i]
383 398 copies[dst] = src
384 399 return copies
385 400 except (ValueError, IndexError):
386 401 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
387 402 # used different syntax for the value.
388 403 return None
389 404
390 405
391 406 def encodefileindices(files, subset):
392 407 subset = set(subset)
393 408 indices = []
394 409 for i, f in enumerate(files):
395 410 if f in subset:
396 411 indices.append(b'%d' % i)
397 412 return b'\n'.join(indices)
398 413
399 414
400 415 def decodefileindices(files, data):
401 416 try:
402 417 subset = []
403 418 if not data:
404 419 return subset
405 420 for strindex in data.split(b'\n'):
406 421 i = int(strindex)
407 422 if i < 0 or i >= len(files):
408 423 return None
409 424 subset.append(files[i])
410 425 return subset
411 426 except (ValueError, IndexError):
412 427 # Perhaps someone had chosen the same key name (e.g. "added") and
413 428 # used different syntax for the value.
414 429 return None
415 430
416 431
417 432 # see mercurial/helptext/internals/revlogs.txt for details about the format
418 433
419 434 ACTION_MASK = int("111" "00", 2)
420 435 # note: an untouched file used as a copy source will appear as `000` for this mask.
421 436 ADDED_FLAG = int("001" "00", 2)
422 437 MERGED_FLAG = int("010" "00", 2)
423 438 REMOVED_FLAG = int("011" "00", 2)
424 439 # `100` is reserved for future use
425 440 TOUCHED_FLAG = int("101" "00", 2)
426 441
427 442 COPIED_MASK = int("11", 2)
428 443 COPIED_FROM_P1_FLAG = int("10", 2)
429 444 COPIED_FROM_P2_FLAG = int("11", 2)
430 445
431 446 # structure is <flag><filename-end><copy-source>
432 447 INDEX_HEADER = struct.Struct(">L")
433 448 INDEX_ENTRY = struct.Struct(">bLL")
434 449
435 450
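
A worked illustration (not from the changeset) of the flag layout: the action lives in bits 4..2 and the copy information in bits 1..0, so both fit into the single signed byte of INDEX_ENTRY:

flag = ADDED_FLAG | COPIED_FROM_P1_FLAG
assert flag == 0b00110
assert flag & ACTION_MASK == ADDED_FLAG
assert flag & COPIED_MASK == COPIED_FROM_P1_FLAG

entry = INDEX_ENTRY.pack(flag, 42, 0)  # flag, filename-end, copy index
assert INDEX_ENTRY.unpack(entry) == (flag, 42, 0)
assert INDEX_ENTRY.size == 9  # 1-byte flag + two 4-byte big-endian uints
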
436 451 def encode_files_sidedata(files):
437 452 all_files = set(files.touched - files.salvaged)
438 453 all_files.update(files.copied_from_p1.values())
439 454 all_files.update(files.copied_from_p2.values())
440 455 all_files = sorted(all_files)
441 456 file_idx = {f: i for (i, f) in enumerate(all_files)}
442 457 file_idx[None] = 0
443 458
444 459 chunks = [INDEX_HEADER.pack(len(all_files))]
445 460
446 461 filename_length = 0
447 462 for f in all_files:
448 463 filename_size = len(f)
449 464 filename_length += filename_size
450 465 flag = 0
451 466 if f in files.added:
452 467 flag |= ADDED_FLAG
453 468 elif f in files.merged:
454 469 flag |= MERGED_FLAG
455 470 elif f in files.removed:
456 471 flag |= REMOVED_FLAG
457 472 elif f in files.touched:
458 473 flag |= TOUCHED_FLAG
459 474
460 475 copy = None
461 476 if f in files.copied_from_p1:
462 477 flag |= COPIED_FROM_P1_FLAG
463 478 copy = files.copied_from_p1.get(f)
464 479 elif f in files.copied_from_p2:
465 480 copy = files.copied_from_p2.get(f)
466 481 flag |= COPIED_FROM_P2_FLAG
467 482 copy_idx = file_idx[copy]
468 483 chunks.append(INDEX_ENTRY.pack(flag, filename_length, copy_idx))
469 484 chunks.extend(all_files)
470 485 return {sidedatamod.SD_FILES: b''.join(chunks)}
471 486
472 487
473 488 def decode_files_sidedata(sidedata):
474 489 md = ChangingFiles()
475 490 raw = sidedata.get(sidedatamod.SD_FILES)
476 491
477 492 if raw is None:
478 493 return md
479 494
480 495 copies = []
481 496 all_files = []
482 497
483 498 assert len(raw) >= INDEX_HEADER.size
484 499 total_files = INDEX_HEADER.unpack_from(raw, 0)[0]
485 500
486 501 offset = INDEX_HEADER.size
487 502 file_offset_base = offset + (INDEX_ENTRY.size * total_files)
488 503 file_offset_last = file_offset_base
489 504
490 505 assert len(raw) >= file_offset_base
491 506
492 507 for idx in range(total_files):
493 508 flag, file_end, copy_idx = INDEX_ENTRY.unpack_from(raw, offset)
494 509 file_end += file_offset_base
495 510 filename = raw[file_offset_last:file_end]
496 511 filesize = file_end - file_offset_last
497 512 assert len(filename) == filesize
498 513 offset += INDEX_ENTRY.size
499 514 file_offset_last = file_end
500 515 all_files.append(filename)
501 516 if flag & ACTION_MASK == ADDED_FLAG:
502 517 md.mark_added(filename)
503 518 elif flag & ACTION_MASK == MERGED_FLAG:
504 519 md.mark_merged(filename)
505 520 elif flag & ACTION_MASK == REMOVED_FLAG:
506 521 md.mark_removed(filename)
507 522 elif flag & ACTION_MASK == TOUCHED_FLAG:
508 523 md.mark_touched(filename)
509 524
510 525 copied = None
511 526 if flag & COPIED_MASK == COPIED_FROM_P1_FLAG:
512 527 copied = md.mark_copied_from_p1
513 528 elif flag & COPIED_MASK == COPIED_FROM_P2_FLAG:
514 529 copied = md.mark_copied_from_p2
515 530
516 531 if copied is not None:
517 532 copies.append((copied, filename, copy_idx))
518 533
519 534 for copied, filename, copy_idx in copies:
520 535 copied(all_files[copy_idx], filename)
521 536
522 537 return md
523 538
524 539
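
The encoder and decoder are designed to round-trip, which the __eq__ defined on ChangingFiles makes easy to check. A small sketch (salvaged files are deliberately dropped by the encoding, so none appear here):

files = ChangingFiles()
files.mark_added(b'a.txt')
files.mark_removed(b'b.txt')
files.mark_copied_from_p1(b'source.txt', b'a.txt')

sidedata = encode_files_sidedata(files)
assert decode_files_sidedata(sidedata) == files
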
525 540 def _getsidedata(srcrepo, rev):
526 541 ctx = srcrepo[rev]
527 542 files = compute_all_files_changes(ctx)
528 543 return encode_files_sidedata(files)
529 544
530 545
531 546 def getsidedataadder(srcrepo, destrepo):
532 547 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
533 548 if pycompat.iswindows or not use_w:
534 549 return _get_simple_sidedata_adder(srcrepo, destrepo)
535 550 else:
536 551 return _get_worker_sidedata_adder(srcrepo, destrepo)
537 552
538 553
539 554 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
540 555 """The function used by worker precomputing sidedata
541 556
542 557 It reads an input queue containing revision numbers
543 558 It writes to an output queue containing (rev, <sidedata-map>) pairs
544 559
545 560 The `None` input value is used as a stop signal.
546 561
547 562 The `tokens` semaphore is used to avoid having too many unprocessed
548 563 entries. Each worker needs to acquire one token before fetching a task.
549 564 Tokens are released by the consumer of the produced data.
550 565 """
551 566 tokens.acquire()
552 567 rev = revs_queue.get()
553 568 while rev is not None:
554 569 data = _getsidedata(srcrepo, rev)
555 570 sidedata_queue.put((rev, data))
556 571 tokens.acquire()
557 572 rev = revs_queue.get()
558 573 # processing of `None` is completed, release the token.
559 574 tokens.release()
560 575
561 576
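
A toy, self-contained illustration (an assumption-laden simplification, not Mercurial code) of the token handshake the worker above relies on: the consumer releases one token per result it drains, so workers can never run more than the buffer ahead.

import multiprocessing

def _toy_worker(inq, outq, tokens):
    tokens.acquire()
    item = inq.get()
    while item is not None:
        outq.put((item, item * item))  # stands in for _getsidedata()
        tokens.acquire()
        item = inq.get()
    tokens.release()  # token acquired for the None sentinel

if __name__ == '__main__':
    tokens = multiprocessing.BoundedSemaphore(4)  # at most 4 undrained results
    inq, outq = multiprocessing.Queue(), multiprocessing.Queue()
    w = multiprocessing.Process(target=_toy_worker, args=(inq, outq, tokens))
    w.start()
    for i in range(8):
        inq.put(i)
    inq.put(None)  # stop signal
    for _ in range(8):
        rev, data = outq.get()
        tokens.release()  # make room for the next result
    w.join()
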
562 577 BUFF_PER_WORKER = 50
563 578
564 579
565 580 def _get_worker_sidedata_adder(srcrepo, destrepo):
566 581 """The parallel version of the sidedata computation
567 582
568 583 This code spawns a pool of workers that precompute a buffer of sidedata
569 584 before we actually need them"""
570 585 # avoid circular import copies -> scmutil -> worker -> copies
571 586 from . import worker
572 587
573 588 nbworkers = worker._numworkers(srcrepo.ui)
574 589
575 590 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
576 591 revsq = multiprocessing.Queue()
577 592 sidedataq = multiprocessing.Queue()
578 593
579 594 assert srcrepo.filtername is None
580 595 # queue all tasks beforehand, revision numbers are small and it makes
581 596 # synchronisation simpler
582 597 #
583 598 # Since the computation for each node can be quite expensive, the overhead
584 599 # of using a single queue is not relevant. In practice, most computations
585 600 # are fast but some are very expensive and dominate all the other smaller
586 601 # costs.
587 602 for r in srcrepo.changelog.revs():
588 603 revsq.put(r)
589 604 # queue the "no more tasks" markers
590 605 for i in range(nbworkers):
591 606 revsq.put(None)
592 607
593 608 allworkers = []
594 609 for i in range(nbworkers):
595 610 args = (srcrepo, revsq, sidedataq, tokens)
596 611 w = multiprocessing.Process(target=_sidedata_worker, args=args)
597 612 allworkers.append(w)
598 613 w.start()
599 614
600 615 # dictionary to store results for revisions higher than the one we are
601 616 # looking for. For example, if we need the sidedatamap for 42, and 43 is
602 617 # received, we shelve 43 for later use.
603 618 staging = {}
604 619
605 620 def sidedata_companion(revlog, rev):
606 621 sidedata = {}
607 622 if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog
608 623 # Was the data previously shelved?
609 624 sidedata = staging.pop(rev, None)
610 625 if sidedata is None:
611 626 # look at the queued results until we find the one we are looking
612 627 # for (shelve the other ones)
613 628 r, sidedata = sidedataq.get()
614 629 while r != rev:
615 630 staging[r] = sidedata
616 631 r, sidedata = sidedataq.get()
617 632 tokens.release()
618 633 return False, (), sidedata
619 634
620 635 return sidedata_companion
621 636
622 637
623 638 def _get_simple_sidedata_adder(srcrepo, destrepo):
624 639 """The simple version of the sidedata computation
625 640
626 641 It just computes it in the same thread on request"""
627 642
628 643 def sidedatacompanion(revlog, rev):
629 644 sidedata = {}
630 645 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
631 646 sidedata = _getsidedata(srcrepo, rev)
632 647 return False, (), sidedata
633 648
634 649 return sidedatacompanion
635 650
636 651
637 652 def getsidedataremover(srcrepo, destrepo):
638 653 def sidedatacompanion(revlog, rev):
639 654 f = ()
640 655 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
641 656 if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
642 657 f = (
643 658 sidedatamod.SD_P1COPIES,
644 659 sidedatamod.SD_P2COPIES,
645 660 sidedatamod.SD_FILESADDED,
646 661 sidedatamod.SD_FILESREMOVED,
647 662 )
648 663 return False, f, {}
649 664
650 665 return sidedatacompanion
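
All three companions share the same shape. A hedged sketch of the assumed calling convention (the real driver lives in the revlog upgrade code, which is not part of this changeset): each call returns whether to skip the revision, which sidedata keys to drop, and a map of sidedata to add.

# `srcrepo` and `destrepo` are assumed to be obtained elsewhere.
companion = getsidedataadder(srcrepo, destrepo)
for rev in srcrepo.changelog.revs():
    skip, keys_to_drop, new_sidedata = companion(srcrepo.changelog, rev)
    assert skip is False and keys_to_drop == ()
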