salvaged: explicitly skip salvaged file while encoding...
marmoute
r46234:3bfa7c7f default draft
@@ -1,644 +1,644
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11 import struct
12 12
13 13 from . import (
14 14 error,
15 15 node,
16 16 pycompat,
17 17 util,
18 18 )
19 19
20 20 from .revlogutils import (
21 21 flagutil as sidedataflag,
22 22 sidedata as sidedatamod,
23 23 )
24 24
25 25
26 26 class ChangingFiles(object):
27 27 """A class recording the changes made to files by a changeset
28 28
29 29 Actions performed on files are gathered into 5 sets:
30 30
31 31 - added: files actively added in the changeset.
32 32 - merged: files whose history got merged
33 33 - removed: files removed in the revision
34 34 - salvaged: files that might have been deleted by a merge but were not
35 35 - touched: files affected by the merge
36 36
37 37 and copies information is held by 2 mappings
38 38
39 39 - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
40 40 - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies
41 41
42 42 See their inline help for details.
43 43 """
44 44
45 45 def __init__(
46 46 self,
47 47 touched=None,
48 48 added=None,
49 49 removed=None,
50 50 merged=None,
51 51 salvaged=None,
52 52 p1_copies=None,
53 53 p2_copies=None,
54 54 ):
55 55 self._added = set(() if added is None else added)
56 56 self._merged = set(() if merged is None else merged)
57 57 self._removed = set(() if removed is None else removed)
58 58 self._touched = set(() if touched is None else touched)
59 59 self._salvaged = set(() if salvaged is None else salvaged)
60 60 self._touched.update(self._added)
61 61 self._touched.update(self._merged)
62 62 self._touched.update(self._removed)
63 63 self._p1_copies = dict(() if p1_copies is None else p1_copies)
64 64 self._p2_copies = dict(() if p2_copies is None else p2_copies)
65 65
66 66 def __eq__(self, other):
67 67 return (
68 68 self.added == other.added
69 69 and self.merged == other.merged
70 70 and self.removed == other.removed
71 71 and self.salvaged == other.salvaged
72 72 and self.touched == other.touched
73 73 and self.copied_from_p1 == other.copied_from_p1
74 74 and self.copied_from_p2 == other.copied_from_p2
75 75 )
76 76
77 77 @util.propertycache
78 78 def added(self):
79 79 """files actively added in the changeset
80 80
81 81 Any file present in that revision that was absent in all the changeset's
82 82 parents.
83 83
84 84 In case of merge, this means a file absent in one of the parents but
85 85 existing in the other will *not* be contained in this set. (They were
86 86 added by an ancestor)
87 87 """
88 88 return frozenset(self._added)
89 89
90 90 def mark_added(self, filename):
91 91 if 'added' in vars(self):
92 92 del self.added
93 93 self._added.add(filename)
94 94 self.mark_touched(filename)
95 95
96 96 def update_added(self, filenames):
97 97 for f in filenames:
98 98 self.mark_added(f)
99 99
100 100 @util.propertycache
101 101 def merged(self):
102 102 """files actively merged during a merge
103 103
104 104 Any modified file which had modifications on both sides that needed merging.
105 105
106 106 In this case a new filenode was created and it has two parents.
107 107 """
108 108 return frozenset(self._merged)
109 109
110 110 def mark_merged(self, filename):
111 111 if 'merged' in vars(self):
112 112 del self.merged
113 113 self._merged.add(filename)
114 114 self.mark_touched(filename)
115 115
116 116 def update_merged(self, filenames):
117 117 for f in filenames:
118 118 self.mark_merged(f)
119 119
120 120 @util.propertycache
121 121 def removed(self):
122 122 """files actively removed by the changeset
123 123
124 124 In case of merge this will only contain the set of files removing "new"
125 125 content. For any file absent in the current changeset:
126 126
127 127 a) If the file exists in both parents, it is clearly "actively" removed
128 128 by this changeset.
129 129
130 130 b) If a file exists in only one parent and in none of the common
131 131 ancestors, then the file was newly added in one of the merged branches
132 132 and then got "actively" removed.
133 133
134 134 c) If a file exists in only one parent and at least one of the common
135 135 ancestors using the same filenode, then the file was unchanged on one
136 136 side and deleted on the other side. The merge "passively" propagated
137 137 that deletion, but didn't "actively" remove the file. In this case the
138 138 file is *not* included in the `removed` set.
139 139
140 140 d) If a file exists in only one parent and at least one of the common
141 141 ancestors using a different filenode, then the file was changed on one
142 142 side and removed on the other side. The merge process "actively"
143 143 decided to drop the new change and delete the file. Unlike in the
144 144 previous case, (c), the file *is* included in the `removed` set.
145 145
146 146 Summary table for merge:
147 147
148 148 case | exists in parents | exists in gca || removed
149 149 (a) | both | * || yes
150 150 (b) | one | none || yes
151 151 (c) | one | same filenode || no
152 152 (d) | one | new filenode || yes
153 153 """
154 154 return frozenset(self._removed)
155 155
156 156 def mark_removed(self, filename):
157 157 if 'removed' in vars(self):
158 158 del self.removed
159 159 self._removed.add(filename)
160 160 self.mark_touched(filename)
161 161
162 162 def update_removed(self, filenames):
163 163 for f in filenames:
164 164 self.mark_removed(f)
165 165
166 166 @util.propertycache
167 167 def salvaged(self):
168 168 """files that might have been deleted by a merge, but still exists.
169 169
170 170 During a merge, the manifest merging might select some files for
171 171 removal, or for a removed/changed conflict. If at commit time the file
172 172 still exists, its removal was "reverted" and the file is "salvaged"
173 173 """
174 174 return frozenset(self._salvaged)
175 175
176 176 def mark_salvaged(self, filename):
177 177 if "salvaged" in vars(self):
178 178 del self.salvaged
179 179 self._salvaged.add(filename)
180 180 self.mark_touched(filename)
181 181
182 182 def update_salvaged(self, filenames):
183 183 for f in filenames:
184 184 self.mark_salvaged(f)
185 185
186 186 @util.propertycache
187 187 def touched(self):
188 188 """files either actively modified, added or removed"""
189 189 return frozenset(self._touched)
190 190
191 191 def mark_touched(self, filename):
192 192 if 'touched' in vars(self):
193 193 del self.touched
194 194 self._touched.add(filename)
195 195
196 196 def update_touched(self, filenames):
197 197 for f in filenames:
198 198 self.mark_touched(f)
199 199
200 200 @util.propertycache
201 201 def copied_from_p1(self):
202 202 return self._p1_copies.copy()
203 203
204 204 def mark_copied_from_p1(self, source, dest):
205 205 if 'copied_from_p1' in vars(self):
206 206 del self.copied_from_p1
207 207 self._p1_copies[dest] = source
208 208
209 209 def update_copies_from_p1(self, copies):
210 210 for dest, source in copies.items():
211 211 self.mark_copied_from_p1(source, dest)
212 212
213 213 @util.propertycache
214 214 def copied_from_p2(self):
215 215 return self._p2_copies.copy()
216 216
217 217 def mark_copied_from_p2(self, source, dest):
218 218 if 'copied_from_p2' in vars(self):
219 219 del self.copied_from_p2
220 220 self._p2_copies[dest] = source
221 221
222 222 def update_copies_from_p2(self, copies):
223 223 for dest, source in copies.items():
224 224 self.mark_copied_from_p2(source, dest)
225 225
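# Editor's sketch (not part of the original change): a minimal illustration
# of how ChangingFiles records actions, assuming only the class above; the
# helper name is hypothetical. Marking a file under any action also marks it
# as touched, and each mark_* call drops the cached frozenset property so a
# later read reflects the update.
def _example_changing_files():
    files = ChangingFiles()
    files.mark_added(b'new.txt')  # lands in both `added` and `touched`
    files.mark_salvaged(b'kept.txt')  # salvaged files are touched as well
    files.mark_copied_from_p1(b'old.txt', b'new.txt')
    assert b'new.txt' in files.added
    assert b'kept.txt' in files.touched
    assert files.copied_from_p1 == {b'new.txt': b'old.txt'}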
226 226
227 227 def computechangesetfilesadded(ctx):
228 228 """return the list of files added in a changeset
229 229 """
230 230 added = []
231 231 for f in ctx.files():
232 232 if not any(f in p for p in ctx.parents()):
233 233 added.append(f)
234 234 return added
235 235
236 236
237 237 def get_removal_filter(ctx, x=None):
238 238 """return a function to detect files "wrongly" detected as `removed`
239 239
240 240 When a file is removed relative to p1 in a merge, this
241 241 function determines whether the absence is due to a
242 242 deletion from a parent, or whether the merge commit
243 243 itself deletes the file. We decide this by doing a
244 244 simplified three way merge of the manifest entry for
245 245 the file. There are two ways we decide the merge
246 246 itself didn't delete a file:
247 247 - neither parent (nor the merge) contains the file
248 248 - exactly one parent contains the file, and that
249 249 parent has the same filelog entry as the merge
250 250 ancestor (or all of them if there are two). In other
251 251 words, that parent left the file unchanged while the
252 252 other one deleted it.
253 253 One way to think about this is that deleting a file is
254 254 similar to emptying it, so the list of changed files
255 255 should be similar either way. The computation
256 256 described above is not done directly in _filecommit
257 257 when creating the list of changed files, however
258 258 it does something very similar by comparing filelog
259 259 nodes.
260 260 """
261 261
262 262 if x is not None:
263 263 p1, p2, m1, m2 = x
264 264 else:
265 265 p1 = ctx.p1()
266 266 p2 = ctx.p2()
267 267 m1 = p1.manifest()
268 268 m2 = p2.manifest()
269 269
270 270 @util.cachefunc
271 271 def mas():
272 272 p1n = p1.node()
273 273 p2n = p2.node()
274 274 cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
275 275 if not cahs:
276 276 cahs = [node.nullrev]
277 277 return [ctx.repo()[r].manifest() for r in cahs]
278 278
279 279 def deletionfromparent(f):
280 280 if f in m1:
281 281 return f not in m2 and all(
282 282 f in ma and ma.find(f) == m1.find(f) for ma in mas()
283 283 )
284 284 elif f in m2:
285 285 return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
286 286 else:
287 287 return True
288 288
289 289 return deletionfromparent
290 290
291 291
292 292 def computechangesetfilesremoved(ctx):
293 293 """return the list of files removed in a changeset
294 294 """
295 295 removed = []
296 296 for f in ctx.files():
297 297 if f not in ctx:
298 298 removed.append(f)
299 299 if removed:
300 300 rf = get_removal_filter(ctx)
301 301 removed = [r for r in removed if not rf(r)]
302 302 return removed
303 303
304 304
305 305 def computechangesetfilesmerged(ctx):
306 306 """return the list of files merged in a changeset
307 307 """
308 308 merged = []
309 309 if len(ctx.parents()) < 2:
310 310 return merged
311 311 for f in ctx.files():
312 312 if f in ctx:
313 313 fctx = ctx[f]
314 314 parents = fctx._filelog.parents(fctx._filenode)
315 315 if parents[1] != node.nullid:
316 316 merged.append(f)
317 317 return merged
318 318
319 319
320 320 def computechangesetcopies(ctx):
321 321 """return the copies data for a changeset
322 322
323 323 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
324 324
325 325 Each dictionary is of the form: `{newname: oldname}`
326 326 """
327 327 p1copies = {}
328 328 p2copies = {}
329 329 p1 = ctx.p1()
330 330 p2 = ctx.p2()
331 331 narrowmatch = ctx._repo.narrowmatch()
332 332 for dst in ctx.files():
333 333 if not narrowmatch(dst) or dst not in ctx:
334 334 continue
335 335 copied = ctx[dst].renamed()
336 336 if not copied:
337 337 continue
338 338 src, srcnode = copied
339 339 if src in p1 and p1[src].filenode() == srcnode:
340 340 p1copies[dst] = src
341 341 elif src in p2 and p2[src].filenode() == srcnode:
342 342 p2copies[dst] = src
343 343 return p1copies, p2copies
344 344
345 345
346 346 def encodecopies(files, copies):
347 347 items = []
348 348 for i, dst in enumerate(files):
349 349 if dst in copies:
350 350 items.append(b'%d\0%s' % (i, copies[dst]))
351 351 if len(items) != len(copies):
352 352 raise error.ProgrammingError(
353 353 b'some copy targets missing from file list'
354 354 )
355 355 return b"\n".join(items)
356 356
357 357
358 358 def decodecopies(files, data):
359 359 try:
360 360 copies = {}
361 361 if not data:
362 362 return copies
363 363 for l in data.split(b'\n'):
364 364 strindex, src = l.split(b'\0')
365 365 i = int(strindex)
366 366 dst = files[i]
367 367 copies[dst] = src
368 368 return copies
369 369 except (ValueError, IndexError):
370 370 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
371 371 # used different syntax for the value.
372 372 return None
373 373
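# Editor's illustration (hypothetical helper, not in the original change):
# the legacy extra-field encoding above pairs the index of each copy target
# in `files` with its source name, NUL-separated, one entry per line.
def _example_copies_roundtrip():
    files = [b'a.txt', b'b.txt', b'c.txt']
    copies = {b'b.txt': b'old.txt'}
    data = encodecopies(files, copies)
    assert data == b'1\x00old.txt'
    assert decodecopies(files, data) == copies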
374 374
375 375 def encodefileindices(files, subset):
376 376 subset = set(subset)
377 377 indices = []
378 378 for i, f in enumerate(files):
379 379 if f in subset:
380 380 indices.append(b'%d' % i)
381 381 return b'\n'.join(indices)
382 382
383 383
384 384 def decodefileindices(files, data):
385 385 try:
386 386 subset = []
387 387 if not data:
388 388 return subset
389 389 for strindex in data.split(b'\n'):
390 390 i = int(strindex)
391 391 if i < 0 or i >= len(files):
392 392 return None
393 393 subset.append(files[i])
394 394 return subset
395 395 except (ValueError, IndexError):
396 396 # Perhaps someone had chosen the same key name (e.g. "added") and
397 397 # used different syntax for the value.
398 398 return None
399 399
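# Editor's illustration (hypothetical helper, not in the original change):
# the matching index-list encoding stores newline-separated positions into
# the `files` list rather than the file names themselves.
def _example_indices_roundtrip():
    files = [b'a.txt', b'b.txt', b'c.txt']
    data = encodefileindices(files, [b'a.txt', b'c.txt'])
    assert data == b'0\n2'
    assert decodefileindices(files, data) == [b'a.txt', b'c.txt']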
400 400
401 401 # see mercurial/helptext/internals/revlogs.txt for details about the format
402 402
403 403 ACTION_MASK = int("111" "00", 2)
404 404 # note: an untouched file used as a copy source will appear as `000` for this mask.
405 405 ADDED_FLAG = int("001" "00", 2)
406 406 MERGED_FLAG = int("010" "00", 2)
407 407 REMOVED_FLAG = int("011" "00", 2)
408 408 # `100` is reserved for future use
409 409 TOUCHED_FLAG = int("101" "00", 2)
410 410
411 411 COPIED_MASK = int("11", 2)
412 412 COPIED_FROM_P1_FLAG = int("10", 2)
413 413 COPIED_FROM_P2_FLAG = int("11", 2)
414 414
415 415 # structure is <flag><filename-end><copy-source>
416 416 INDEX_HEADER = struct.Struct(">L")
417 417 INDEX_ENTRY = struct.Struct(">bLL")
418 418
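# Editor's note (not part of the original change): each index entry packs to
# 9 bytes (">bLL": one signed byte of flags, then two big-endian 32-bit
# unsigned integers holding the filename end offset and the copy-source
# index). Action bits and copy bits combine by OR; for example, a file added
# in the changeset and copied from p1 carries `00110`:
assert INDEX_ENTRY.size == 9
assert ADDED_FLAG | COPIED_FROM_P1_FLAG == int("001" "10", 2)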
419 419
420 420 def encode_files_sidedata(files):
421 all_files = set(files.touched)
421 all_files = set(files.touched - files.salvaged)
422 422 all_files.update(files.copied_from_p1.values())
423 423 all_files.update(files.copied_from_p2.values())
424 424 all_files = sorted(all_files)
425 425 file_idx = {f: i for (i, f) in enumerate(all_files)}
426 426 file_idx[None] = 0
427 427
428 428 chunks = [INDEX_HEADER.pack(len(all_files))]
429 429
430 430 filename_length = 0
431 431 for f in all_files:
432 432 filename_size = len(f)
433 433 filename_length += filename_size
434 434 flag = 0
435 435 if f in files.added:
436 436 flag |= ADDED_FLAG
437 437 elif f in files.merged:
438 438 flag |= MERGED_FLAG
439 439 elif f in files.removed:
440 440 flag |= REMOVED_FLAG
441 441 elif f in files.touched:
442 442 flag |= TOUCHED_FLAG
443 443
444 444 copy = None
445 445 if f in files.copied_from_p1:
446 446 flag |= COPIED_FROM_P1_FLAG
447 447 copy = files.copied_from_p1.get(f)
448 448 elif f in files.copied_from_p2:
449 449 copy = files.copied_from_p2.get(f)
450 450 flag |= COPIED_FROM_P2_FLAG
451 451 copy_idx = file_idx[copy]
452 452 chunks.append(INDEX_ENTRY.pack(flag, filename_length, copy_idx))
453 453 chunks.extend(all_files)
454 454 return {sidedatamod.SD_FILES: b''.join(chunks)}
455 455
456 456
457 457 def decode_files_sidedata(sidedata):
458 458 md = ChangingFiles()
459 459 raw = sidedata.get(sidedatamod.SD_FILES)
460 460
461 461 if raw is None:
462 462 return md
463 463
464 464 copies = []
465 465 all_files = []
466 466
467 467 assert len(raw) >= INDEX_HEADER.size
468 468 total_files = INDEX_HEADER.unpack_from(raw, 0)[0]
469 469
470 470 offset = INDEX_HEADER.size
471 471 file_offset_base = offset + (INDEX_ENTRY.size * total_files)
472 472 file_offset_last = file_offset_base
473 473
474 474 assert len(raw) >= file_offset_base
475 475
476 476 for idx in range(total_files):
477 477 flag, file_end, copy_idx = INDEX_ENTRY.unpack_from(raw, offset)
478 478 file_end += file_offset_base
479 479 filename = raw[file_offset_last:file_end]
480 480 filesize = file_end - file_offset_last
481 481 assert len(filename) == filesize
482 482 offset += INDEX_ENTRY.size
483 483 file_offset_last = file_end
484 484 all_files.append(filename)
485 485 if flag & ACTION_MASK == ADDED_FLAG:
486 486 md.mark_added(filename)
487 487 elif flag & ACTION_MASK == MERGED_FLAG:
488 488 md.mark_merged(filename)
489 489 elif flag & ACTION_MASK == REMOVED_FLAG:
490 490 md.mark_removed(filename)
491 491 elif flag & ACTION_MASK == TOUCHED_FLAG:
492 492 md.mark_touched(filename)
493 493
494 494 copied = None
495 495 if flag & COPIED_MASK == COPIED_FROM_P1_FLAG:
496 496 copied = md.mark_copied_from_p1
497 497 elif flag & COPIED_MASK == COPIED_FROM_P2_FLAG:
498 498 copied = md.mark_copied_from_p2
499 499
500 500 if copied is not None:
501 501 copies.append((copied, filename, copy_idx))
502 502
503 503 for copied, filename, copy_idx in copies:
504 504 copied(all_files[copy_idx], filename)
505 505
506 506 return md
507 507
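# Editor's sketch (hypothetical helper, not in the original change): a round
# trip through the sidedata encoding. Per the change in this revision,
# salvaged files are explicitly skipped by the encoder (the binary format
# has no flag for them yet), so they do not survive the round trip.
def _example_sidedata_roundtrip():
    files = ChangingFiles()
    files.mark_added(b'added.txt')
    files.mark_salvaged(b'salvaged.txt')
    decoded = decode_files_sidedata(encode_files_sidedata(files))
    assert b'added.txt' in decoded.added
    assert b'salvaged.txt' not in decoded.touched  # skipped while encoding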
508 508
509 509 def _getsidedata(srcrepo, rev):
510 510 ctx = srcrepo[rev]
511 511 filescopies = computechangesetcopies(ctx)
512 512 filesadded = computechangesetfilesadded(ctx)
513 513 filesremoved = computechangesetfilesremoved(ctx)
514 514 filesmerged = computechangesetfilesmerged(ctx)
515 515 files = ChangingFiles()
516 516 files.update_touched(ctx.files())
517 517 files.update_added(filesadded)
518 518 files.update_removed(filesremoved)
519 519 files.update_merged(filesmerged)
520 520 files.update_copies_from_p1(filescopies[0])
521 521 files.update_copies_from_p2(filescopies[1])
522 522 return encode_files_sidedata(files)
523 523
524 524
525 525 def getsidedataadder(srcrepo, destrepo):
526 526 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
527 527 if pycompat.iswindows or not use_w:
528 528 return _get_simple_sidedata_adder(srcrepo, destrepo)
529 529 else:
530 530 return _get_worker_sidedata_adder(srcrepo, destrepo)
531 531
532 532
533 533 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
534 534 """The function used by worker precomputing sidedata
535 535
536 536 It reads an input queue containing revision numbers.
537 537 It writes (rev, <sidedata-map>) pairs to an output queue.
538 538
539 539 The `None` input value is used as a stop signal.
540 540
541 541 The `tokens` semaphore is used to avoid having too many unprocessed
542 542 entries. Each worker needs to acquire one token before fetching a task.
543 543 They will be released by the consumer of the produced data.
544 544 """
545 545 tokens.acquire()
546 546 rev = revs_queue.get()
547 547 while rev is not None:
548 548 data = _getsidedata(srcrepo, rev)
549 549 sidedata_queue.put((rev, data))
550 550 tokens.acquire()
551 551 rev = revs_queue.get()
552 552 # processing of `None` is completed, release the token.
553 553 tokens.release()
554 554
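# Editor's sketch (hypothetical helpers, not in the original change): the
# same queue/token protocol with a trivial computation standing in for
# _getsidedata, so the flow can run without a repository. Each produced
# entry holds one token until the consumer releases it, which bounds the
# number of unconsumed results.
def _toy_worker(revs_queue, out_queue, tokens):
    # mirrors _sidedata_worker: acquire a token, pull a rev, emit (rev, data)
    tokens.acquire()
    rev = revs_queue.get()
    while rev is not None:
        out_queue.put((rev, rev * 2))
        tokens.acquire()
        rev = revs_queue.get()
    tokens.release()


def _example_worker_protocol():
    tokens = multiprocessing.BoundedSemaphore(4)
    revs = multiprocessing.Queue()
    out = multiprocessing.Queue()
    for r in (0, 1, 2):
        revs.put(r)
    revs.put(None)  # one stop signal per worker
    w = multiprocessing.Process(target=_toy_worker, args=(revs, out, tokens))
    w.start()
    results = {}
    for _ in range(3):
        rev, data = out.get()
        results[rev] = data
        tokens.release()  # the consumer frees the producer's token
    w.join()
    assert results == {0: 0, 1: 2, 2: 4}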
555 555
556 556 BUFF_PER_WORKER = 50
557 557
558 558
559 559 def _get_worker_sidedata_adder(srcrepo, destrepo):
560 560 """The parallel version of the sidedata computation
561 561
562 562 This code spawns a pool of workers that precompute a buffer of sidedata
563 563 before we actually need them"""
564 564 # avoid circular import copies -> scmutil -> worker -> copies
565 565 from . import worker
566 566
567 567 nbworkers = worker._numworkers(srcrepo.ui)
568 568
569 569 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
570 570 revsq = multiprocessing.Queue()
571 571 sidedataq = multiprocessing.Queue()
572 572
573 573 assert srcrepo.filtername is None
574 574 # queue all tasks beforehand; revision numbers are small and it makes
575 575 # synchronisation simpler
576 576 #
577 577 # Since the computation for each node can be quite expensive, the overhead
578 578 # of using a single queue is not relevant. In practice, most computations
579 579 # are fast, but some are very expensive and dominate all the other smaller
580 580 # costs.
581 581 for r in srcrepo.changelog.revs():
582 582 revsq.put(r)
583 583 # queue the "no more tasks" markers
584 584 for i in range(nbworkers):
585 585 revsq.put(None)
586 586
587 587 allworkers = []
588 588 for i in range(nbworkers):
589 589 args = (srcrepo, revsq, sidedataq, tokens)
590 590 w = multiprocessing.Process(target=_sidedata_worker, args=args)
591 591 allworkers.append(w)
592 592 w.start()
593 593
594 594 # dictionary to store results for revisions higher than the one we are
595 595 # looking for. For example, if we need the sidedata map for 42 and 43 is
596 596 # received, we shelve 43 for later use.
597 597 staging = {}
598 598
599 599 def sidedata_companion(revlog, rev):
600 600 sidedata = {}
601 601 if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog
602 602 # Is the data previously shelved?
603 603 sidedata = staging.pop(rev, None)
604 604 if sidedata is None:
605 605 # look at the queued results until we find the one we are looking
606 606 # for (shelve the other ones)
607 607 r, sidedata = sidedataq.get()
608 608 while r != rev:
609 609 staging[r] = sidedata
610 610 r, sidedata = sidedataq.get()
611 611 tokens.release()
612 612 return False, (), sidedata
613 613
614 614 return sidedata_companion
615 615
616 616
617 617 def _get_simple_sidedata_adder(srcrepo, destrepo):
618 618 """The simple version of the sidedata computation
619 619
620 620 It just computes it in the same thread on request"""
621 621
622 622 def sidedatacompanion(revlog, rev):
623 623 sidedata = {}
624 624 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
625 625 sidedata = _getsidedata(srcrepo, rev)
626 626 return False, (), sidedata
627 627
628 628 return sidedatacompanion
629 629
630 630
631 631 def getsidedataremover(srcrepo, destrepo):
632 632 def sidedatacompanion(revlog, rev):
633 633 f = ()
634 634 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
635 635 if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
636 636 f = (
637 637 sidedatamod.SD_P1COPIES,
638 638 sidedatamod.SD_P2COPIES,
639 639 sidedatamod.SD_FILESADDED,
640 640 sidedatamod.SD_FILESREMOVED,
641 641 )
642 642 return False, f, {}
643 643
644 644 return sidedatacompanion