##// END OF EJS Templates
changing-files: add the ability to track merged files too...
marmoute -
r46186:e5578dbe default
parent child Browse files
Show More
@@ -1,519 +1,542 b''
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11
12 12 from . import (
13 13 error,
14 14 node,
15 15 pycompat,
16 16 util,
17 17 )
18 18
19 19 from .revlogutils import (
20 20 flagutil as sidedataflag,
21 21 sidedata as sidedatamod,
22 22 )
23 23
24 24
class ChangingFiles(object):
    """A class recording the changes made to files by a changeset

    Actions performed on files are gathered into 3 sets:

    - added: files actively added in the changeset.
    - merged: files whose history got merged
    - removed: files removed in the revision
    - touched: files affected by the merge

    and copies information is held by 2 mappings

    - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
    - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies

    See their inline help for details.
    """

    def __init__(
        self,
        touched=None,
        added=None,
        removed=None,
        merged=None,
        p1_copies=None,
        p2_copies=None,
    ):
        self._added = set(() if added is None else added)
        self._merged = set(() if merged is None else merged)
        self._removed = set(() if removed is None else removed)
        self._touched = set(() if touched is None else touched)
        # any added, merged or removed file is by definition also touched
        self._touched.update(self._added)
        self._touched.update(self._merged)
        self._touched.update(self._removed)
        self._p1_copies = dict(() if p1_copies is None else p1_copies)
        self._p2_copies = dict(() if p2_copies is None else p2_copies)

    def __eq__(self, other):
        # Return NotImplemented for foreign objects instead of letting the
        # attribute accesses below raise AttributeError; this lets Python
        # fall back to its default comparison machinery.
        if not isinstance(other, ChangingFiles):
            return NotImplemented
        return (
            self.added == other.added
            and self.merged == other.merged
            and self.removed == other.removed
            and self.touched == other.touched
            and self.copied_from_p1 == other.copied_from_p1
            and self.copied_from_p2 == other.copied_from_p2
        )

    @property
    def added(self):
        """files actively added in the changeset

        Any file present in that revision that was absent in all the changeset's
        parents.

        In case of merge, this means a file absent in one of the parents but
        existing in the other will *not* be contained in this set. (They were
        added by an ancestor)
        """
        return frozenset(self._added)

    def mark_added(self, filename):
        self._added.add(filename)
        self._touched.add(filename)

    def update_added(self, filenames):
        for f in filenames:
            self.mark_added(f)

    @property
    def merged(self):
        """files actively merged during a merge

        Any modified files which had modification on both sides that needed
        merging.

        In this case a new filenode was created and it has two parents.
        """
        return frozenset(self._merged)

    def mark_merged(self, filename):
        self._merged.add(filename)
        self._touched.add(filename)

    def update_merged(self, filenames):
        for f in filenames:
            self.mark_merged(f)

    @property
    def removed(self):
        """files actively removed by the changeset

        In case of merge this will only contain the set of files removing "new"
        content. For any file absent in the current changeset:

        a) If the file exists in both parents, it is clearly "actively" removed
        by this changeset.

        b) If a file exists in only one parent and in none of the common
        ancestors, then the file was newly added in one of the merged branches
        and then got "actively" removed.

        c) If a file exists in only one parent and at least one of the common
        ancestors using the same filenode, then the file was unchanged on one
        side and deleted on the other side. The merge "passively" propagated
        that deletion, but didn't "actively" remove the file. In this case the
        file is *not* included in the `removed` set.

        d) If a file exists in only one parent and at least one of the common
        ancestors using a different filenode, then the file was changed on one
        side and removed on the other side. The merge process "actively"
        decided to drop the new change and delete the file. Unlike in the
        previous case, (c), the file included in the `removed` set.

        Summary table for merge:

        case | exists in parents | exists in gca || removed
         (a) | both              |     *         ||  yes
         (b) | one               |     none      ||  yes
         (c) | one               | same filenode ||  no
         (d) | one               |  new filenode ||  yes
        """
        return frozenset(self._removed)

    def mark_removed(self, filename):
        self._removed.add(filename)
        self._touched.add(filename)

    def update_removed(self, filenames):
        for f in filenames:
            self.mark_removed(f)

    @property
    def touched(self):
        """files either actively modified, added or removed"""
        return frozenset(self._touched)

    def mark_touched(self, filename):
        self._touched.add(filename)

    def update_touched(self, filenames):
        for f in filenames:
            self.mark_touched(f)

    @property
    def copied_from_p1(self):
        return self._p1_copies.copy()

    def mark_copied_from_p1(self, source, dest):
        self._p1_copies[dest] = source

    def update_copies_from_p1(self, copies):
        for dest, source in copies.items():
            self.mark_copied_from_p1(source, dest)

    @property
    def copied_from_p2(self):
        return self._p2_copies.copy()

    def mark_copied_from_p2(self, source, dest):
        self._p2_copies[dest] = source

    def update_copies_from_p2(self, copies):
        for dest, source in copies.items():
            self.mark_copied_from_p2(source, dest)
165 188
166 189
def computechangesetfilesadded(ctx):
    """return the list of files added in a changeset

    A file is "added" when it exists in `ctx` but in none of its parents.
    """
    parents = ctx.parents()
    return [f for f in ctx.files() if not any(f in p for p in parents)]
175 198
176 199
def get_removal_filter(ctx, x=None):
    """return a function to detect files "wrongly" detected as `removed`

    When a file is removed relative to p1 in a merge, this
    function determines whether the absence is due to a
    deletion from a parent, or whether the merge commit
    itself deletes the file. We decide this by doing a
    simplified three way merge of the manifest entry for
    the file. There are two ways we decide the merge
    itself didn't delete a file:
    - neither parent (nor the merge) contain the file
    - exactly one parent contains the file, and that
      parent has the same filelog entry as the merge
      ancestor (or all of them if there are two). In other
      words, that parent left the file unchanged while the
      other one deleted it.
    One way to think about this is that deleting a file is
    similar to emptying it, so the list of changed files
    should be similar either way. The computation
    described above is not done directly in _filecommit
    when creating the list of changed files, however
    it does something very similar by comparing filelog
    nodes.
    """

    if x is None:
        p1 = ctx.p1()
        p2 = ctx.p2()
        m1 = p1.manifest()
        m2 = p2.manifest()
    else:
        p1, p2, m1, m2 = x

    @util.cachefunc
    def gca_manifests():
        # manifests of the greatest common ancestors' heads, computed lazily
        # and at most once
        cahs = ctx.repo().changelog.commonancestorsheads(p1.node(), p2.node())
        if not cahs:
            cahs = [node.nullrev]
        return [ctx.repo()[r].manifest() for r in cahs]

    def deletionfromparent(f):
        # True when the absence of `f` was merely propagated by the merge,
        # i.e. `f` is unchanged relative to every merge ancestor on the side
        # that still has it.
        def unchanged_from_ancestors(m):
            return all(
                f in ma and ma.find(f) == m.find(f) for ma in gca_manifests()
            )

        if f in m1:
            return f not in m2 and unchanged_from_ancestors(m1)
        if f in m2:
            return unchanged_from_ancestors(m2)
        return True

    return deletionfromparent
230 253
231 254
def computechangesetfilesremoved(ctx):
    """return the list of files removed in a changeset

    Files listed in ctx.files() but absent from ctx itself, filtered to drop
    removals that were only propagated from a parent (see get_removal_filter).
    """
    candidates = [f for f in ctx.files() if f not in ctx]
    if not candidates:
        return candidates
    passively_removed = get_removal_filter(ctx)
    return [f for f in candidates if not passively_removed(f)]
243 266
244 267
def computechangesetcopies(ctx):
    """return the copies data for a changeset

    The copies data are returned as a pair of dictionaries
    (p1copies, p2copies), each in the form `{newname: oldname}`.
    """
    p1copies = {}
    p2copies = {}
    parent1 = ctx.p1()
    parent2 = ctx.p2()
    narrowmatch = ctx._repo.narrowmatch()
    for dst in ctx.files():
        # skip files outside the narrowspec or absent from this revision
        if not narrowmatch(dst):
            continue
        if dst not in ctx:
            continue
        rename = ctx[dst].renamed()
        if not rename:
            continue
        src, srcnode = rename
        # attribute the copy to whichever parent holds the matching filenode
        if src in parent1 and parent1[src].filenode() == srcnode:
            p1copies[dst] = src
        elif src in parent2 and parent2[src].filenode() == srcnode:
            p2copies[dst] = src
    return p1copies, p2copies
269 292
270 293
def encodecopies(files, copies):
    """encode a {dest: source} copies mapping relative to the `files` list

    Each entry becomes "<index-of-dest>\\0<source>"; entries are joined with
    newlines. Raises ProgrammingError if a copy target is not in `files`.
    """
    encoded = [
        b'%d\0%s' % (index, copies[dst])
        for index, dst in enumerate(files)
        if dst in copies
    ]
    if len(encoded) != len(copies):
        raise error.ProgrammingError(
            b'some copy targets missing from file list'
        )
    return b"\n".join(encoded)
281 304
282 305
def decodecopies(files, data):
    """decode a copies mapping produced by `encodecopies`

    Returns a {dest: source} dict, or None when the data does not parse
    (e.g. the key was reused with a different value syntax).
    """
    copies = {}
    if not data:
        return copies
    try:
        for line in data.split(b'\n'):
            strindex, src = line.split(b'\0')
            copies[files[int(strindex)]] = src
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "p1copies") and
        # used different syntax for the value.
        return None
    return copies
298 321
299 322
def encodefileindices(files, subset):
    """encode the members of `subset` as newline-separated indices in `files`"""
    members = set(subset)
    return b'\n'.join(
        b'%d' % index for index, f in enumerate(files) if f in members
    )
307 330
308 331
def decodefileindices(files, data):
    """decode a list of file indices produced by `encodefileindices`

    Returns the corresponding sublist of `files`, or None when an index is
    out of range or the data does not parse.
    """
    subset = []
    if not data:
        return subset
    nb_files = len(files)
    try:
        for chunk in data.split(b'\n'):
            index = int(chunk)
            if not (0 <= index < nb_files):
                return None
            subset.append(files[index])
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "added") and
        # used different syntax for the value.
        return None
    return subset
324 347
325 348
def encode_files_sidedata(files):
    """encode a ChangingFiles instance into a sidedata map

    Only non-empty categories are stored; returns None when there is
    nothing to record.
    """
    all_files = sorted(files.touched)
    sidedata = {}
    p1copies = files.copied_from_p1
    if p1copies:
        sidedata[sidedatamod.SD_P1COPIES] = encodecopies(all_files, p1copies)
    p2copies = files.copied_from_p2
    if p2copies:
        sidedata[sidedatamod.SD_P2COPIES] = encodecopies(all_files, p2copies)
    added = files.added
    if added:
        sidedata[sidedatamod.SD_FILESADDED] = encodefileindices(
            all_files, added
        )
    removed = files.removed
    if removed:
        sidedata[sidedatamod.SD_FILESREMOVED] = encodefileindices(
            all_files, removed
        )
    return sidedata or None
348 371
349 372
def decode_files_sidedata(changelogrevision, sidedata):
    """Return a ChangingFiles instance built from a changelogrevision's sidedata

    Missing sidedata categories simply decode to empty collections.
    """
    touched = changelogrevision.files

    return ChangingFiles(
        touched=touched,
        added=decodefileindices(
            touched, sidedata.get(sidedatamod.SD_FILESADDED)
        ),
        removed=decodefileindices(
            touched, sidedata.get(sidedatamod.SD_FILESREMOVED)
        ),
        p1_copies=decodecopies(touched, sidedata.get(sidedatamod.SD_P1COPIES)),
        p2_copies=decodecopies(touched, sidedata.get(sidedatamod.SD_P2COPIES)),
    )
374 397
375 398
def _getsidedata(srcrepo, rev):
    """compute the copy-tracing sidedata map for revision `rev` of `srcrepo`

    Empty encodings are omitted from the returned dict.
    """
    ctx = srcrepo[rev]
    copies = computechangesetcopies(ctx)
    added = computechangesetfilesadded(ctx)
    removed = computechangesetfilesremoved(ctx)
    sidedata = {}
    if any([copies, added, removed]):
        all_files = sorted(ctx.files())
        p1copies, p2copies = copies
        # encode each category, keeping only the non-empty results
        encoded = [
            (sidedatamod.SD_P1COPIES, encodecopies(all_files, p1copies)),
            (sidedatamod.SD_P2COPIES, encodecopies(all_files, p2copies)),
            (sidedatamod.SD_FILESADDED, encodefileindices(all_files, added)),
            (
                sidedatamod.SD_FILESREMOVED,
                encodefileindices(all_files, removed),
            ),
        ]
        for key, data in encoded:
            if data:
                sidedata[key] = data
    return sidedata
398 421
399 422
def getsidedataadder(srcrepo, destrepo):
    """return a sidedata companion adding copy-tracing sidedata

    A pool of workers is used when enabled by config, except on Windows
    where the worker infrastructure is not available.
    """
    use_workers = srcrepo.ui.configbool(
        b'experimental', b'worker.repository-upgrade'
    )
    if use_workers and not pycompat.iswindows:
        return _get_worker_sidedata_adder(srcrepo, destrepo)
    return _get_simple_sidedata_adder(srcrepo, destrepo)
406 429
407 430
def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
    """The function used by workers precomputing sidedata

    It reads an input queue containing revision numbers and writes to an
    output queue entries of the form (rev, <sidedata-map>).

    The `None` input value is used as a stop signal.

    The `tokens` semaphore bounds the number of unprocessed entries: a
    worker acquires one token before fetching each task, and the consumer
    of the produced data releases it.
    """
    while True:
        tokens.acquire()
        rev = revs_queue.get()
        if rev is None:
            # the stop marker has no consumer downstream, so release its
            # token here
            tokens.release()
            break
        sidedata_queue.put((rev, _getsidedata(srcrepo, rev)))
429 452
430 453
# Per-worker bound on precomputed-but-unconsumed sidedata entries: the
# shared semaphore in `_get_worker_sidedata_adder` is created with
# `nbworkers * BUFF_PER_WORKER` tokens, and `_sidedata_worker` acquires one
# token before fetching each task.
BUFF_PER_WORKER = 50
432 455
433 456
def _get_worker_sidedata_adder(srcrepo, destrepo):
    """The parallel version of the sidedata computation

    This code spawns a pool of workers that precompute a buffer of sidedata
    before we actually need them.
    """
    # avoid circular import copies -> scmutil -> worker -> copies
    from . import worker

    nbworkers = worker._numworkers(srcrepo.ui)

    # bound the amount of precomputed-but-unconsumed sidedata: workers
    # acquire a token per task (see `_sidedata_worker`), the companion below
    # releases one per consumed result
    tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
    revsq = multiprocessing.Queue()
    sidedataq = multiprocessing.Queue()

    assert srcrepo.filtername is None
    # queue all tasks beforehand, revision numbers are small and it makes
    # synchronisation simpler
    #
    # Since the computation for each node can be quite expensive, the overhead
    # of using a single queue is not relevant. In practice, most computations
    # are fast but some are very expensive and dominate all the other smaller
    # costs.
    for r in srcrepo.changelog.revs():
        revsq.put(r)
    # queue the "no more tasks" markers, one per worker
    for i in range(nbworkers):
        revsq.put(None)

    allworkers = []
    for i in range(nbworkers):
        args = (srcrepo, revsq, sidedataq, tokens)
        w = multiprocessing.Process(target=_sidedata_worker, args=args)
        allworkers.append(w)
        w.start()

    # dictionary to store results for revisions higher than the one we are
    # looking for. For example, if we need the sidedatamap for 42 and 43 is
    # received, we shelve 43 for later use.
    staging = {}

    def sidedata_companion(revlog, rev):
        sidedata = {}
        # The attribute name must be a native `str`: `getattr` (which backs
        # `util.safehasattr`) rejects bytes names on Python 3. This also
        # matches the spelling used by `_get_simple_sidedata_adder` and
        # `getsidedataremover` below.
        if util.safehasattr(revlog, 'filteredrevs'):  # this is a changelog
            # Is the data previously shelved ?
            sidedata = staging.pop(rev, None)
            if sidedata is None:
                # look at the queued results until we find the one we are
                # looking for (shelve the other ones)
                r, sidedata = sidedataq.get()
                while r != rev:
                    staging[r] = sidedata
                    r, sidedata = sidedataq.get()
            tokens.release()
        return False, (), sidedata

    return sidedata_companion
490 513
491 514
def _get_simple_sidedata_adder(srcrepo, destrepo):
    """The simple version of the sidedata computation

    It just computes the sidedata in the same thread, on request."""

    def sidedatacompanion(revlog, rev):
        # only the changelog gets sidedata; other revlogs are left untouched
        if not util.safehasattr(revlog, 'filteredrevs'):  # not a changelog
            return False, (), {}
        return False, (), _getsidedata(srcrepo, rev)

    return sidedatacompanion
504 527
505 528
def getsidedataremover(srcrepo, destrepo):
    """return a sidedata companion that strips copy-tracing sidedata"""

    def sidedatacompanion(revlog, rev):
        removals = ()
        is_changelog = util.safehasattr(revlog, 'filteredrevs')
        if is_changelog and revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
            # request removal of every copy-related sidedata category
            removals = (
                sidedatamod.SD_P1COPIES,
                sidedatamod.SD_P2COPIES,
                sidedatamod.SD_FILESADDED,
                sidedatamod.SD_FILESREMOVED,
            )
        return False, removals, {}

    return sidedatacompanion
General Comments 0
You need to be logged in to leave comments. Login now