##// END OF EJS Templates
changing-files: cache the various property...
marmoute -
r46198:d3148337 default
parent child Browse files
Show More
@@ -1,557 +1,569 b''
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11
12 12 from . import (
13 13 error,
14 14 node,
15 15 pycompat,
16 16 util,
17 17 )
18 18
19 19 from .revlogutils import (
20 20 flagutil as sidedataflag,
21 21 sidedata as sidedatamod,
22 22 )
23 23
24 24
25 25 class ChangingFiles(object):
26 26 """A class recording the changes made to files by a changeset
27 27
28 28 Actions performed on files are gathered into 4 sets:
29 29
30 30 - added: files actively added in the changeset.
31 31 - merged: files whose history got merged
32 32 - removed: files removed in the revision
33 33 - touched: files affected by the merge
34 34
35 35 and copies information is held by 2 mappings
36 36
37 37 - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
38 38 - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies
39 39
40 40 See their inline help for details.
41 41 """
42 42
43 43 def __init__(
44 44 self,
45 45 touched=None,
46 46 added=None,
47 47 removed=None,
48 48 merged=None,
49 49 p1_copies=None,
50 50 p2_copies=None,
51 51 ):
52 52 self._added = set(() if added is None else added)
53 53 self._merged = set(() if merged is None else merged)
54 54 self._removed = set(() if removed is None else removed)
55 55 self._touched = set(() if touched is None else touched)
56 56 self._touched.update(self._added)
57 57 self._touched.update(self._merged)
58 58 self._touched.update(self._removed)
59 59 self._p1_copies = dict(() if p1_copies is None else p1_copies)
60 60 self._p2_copies = dict(() if p2_copies is None else p2_copies)
61 61
62 62 def __eq__(self, other):
63 63 return (
64 64 self.added == other.added
65 65 and self.merged == other.merged
66 66 and self.removed == other.removed
67 67 and self.touched == other.touched
68 68 and self.copied_from_p1 == other.copied_from_p1
69 69 and self.copied_from_p2 == other.copied_from_p2
70 70 )
71 71
72 @property
72 @util.propertycache
73 73 def added(self):
74 74 """files actively added in the changeset
75 75
76 76 Any file present in that revision that was absent in all the changeset's
77 77 parents.
78 78
79 79 In case of merge, this means a file absent in one of the parents but
80 80 existing in the other will *not* be contained in this set. (They were
81 81 added by an ancestor)
82 82 """
83 83 return frozenset(self._added)
84 84
85 85 def mark_added(self, filename):
86 if 'added' in vars(self):
87 del self.added
86 88 self._added.add(filename)
87 89 self.mark_touched(filename)
88 90
89 91 def update_added(self, filenames):
90 92 for f in filenames:
91 93 self.mark_added(f)
92 94
93 @property
95 @util.propertycache
94 96 def merged(self):
95 97 """files actively merged during a merge
96 98
97 99 Any modified files which had modifications on both sides that needed merging.
98 100
99 101 In this case a new filenode was created and it has two parents.
100 102 """
101 103 return frozenset(self._merged)
102 104
103 105 def mark_merged(self, filename):
106 if 'merged' in vars(self):
107 del self.merged
104 108 self._merged.add(filename)
105 109 self.mark_touched(filename)
106 110
107 111 def update_merged(self, filenames):
108 112 for f in filenames:
109 113 self.mark_merged(f)
110 114
111 @property
115 @util.propertycache
112 116 def removed(self):
113 117 """files actively removed by the changeset
114 118
115 119 In case of merge this will only contain the set of files removing "new"
116 120 content. For any file absent in the current changeset:
117 121
118 122 a) If the file exists in both parents, it is clearly "actively" removed
119 123 by this changeset.
120 124
121 125 b) If a file exists in only one parent and in none of the common
122 126 ancestors, then the file was newly added in one of the merged branches
123 127 and then got "actively" removed.
124 128
125 129 c) If a file exists in only one parent and at least one of the common
126 130 ancestors using the same filenode, then the file was unchanged on one
127 131 side and deleted on the other side. The merge "passively" propagated
128 132 that deletion, but didn't "actively" remove the file. In this case the
129 133 file is *not* included in the `removed` set.
130 134
131 135 d) If a file exists in only one parent and at least one of the common
132 136 ancestors using a different filenode, then the file was changed on one
133 137 side and removed on the other side. The merge process "actively"
134 138 decided to drop the new change and delete the file. Unlike in the
135 139 previous case, (c), the file is included in the `removed` set.
136 140
137 141 Summary table for merge:
138 142
139 143 case | exists in parents | exists in gca || removed
140 144 (a) | both | * || yes
141 145 (b) | one | none || yes
142 146 (c) | one | same filenode || no
143 147 (d) | one | new filenode || yes
144 148 """
145 149 return frozenset(self._removed)
146 150
147 151 def mark_removed(self, filename):
152 if 'removed' in vars(self):
153 del self.removed
148 154 self._removed.add(filename)
149 155 self.mark_touched(filename)
150 156
151 157 def update_removed(self, filenames):
152 158 for f in filenames:
153 159 self.mark_removed(f)
154 160
155 @property
161 @util.propertycache
156 162 def touched(self):
157 163 """files either actively modified, added or removed"""
158 164 return frozenset(self._touched)
159 165
160 166 def mark_touched(self, filename):
167 if 'touched' in vars(self):
168 del self.touched
161 169 self._touched.add(filename)
162 170
163 171 def update_touched(self, filenames):
164 172 for f in filenames:
165 173 self.mark_touched(f)
166 174
167 @property
175 @util.propertycache
168 176 def copied_from_p1(self):
169 177 return self._p1_copies.copy()
170 178
171 179 def mark_copied_from_p1(self, source, dest):
180 if 'copied_from_p1' in vars(self):
181 del self.copied_from_p1
172 182 self._p1_copies[dest] = source
173 183
174 184 def update_copies_from_p1(self, copies):
175 185 for dest, source in copies.items():
176 186 self.mark_copied_from_p1(source, dest)
177 187
178 @property
188 @util.propertycache
179 189 def copied_from_p2(self):
180 190 return self._p2_copies.copy()
181 191
182 192 def mark_copied_from_p2(self, source, dest):
193 if 'copied_from_p2' in vars(self):
194 del self.copied_from_p2
183 195 self._p2_copies[dest] = source
184 196
185 197 def update_copies_from_p2(self, copies):
186 198 for dest, source in copies.items():
187 199 self.mark_copied_from_p2(source, dest)
188 200
189 201
190 202 def computechangesetfilesadded(ctx):
191 203 """return the list of files added in a changeset
192 204 """
193 205 added = []
194 206 for f in ctx.files():
195 207 if not any(f in p for p in ctx.parents()):
196 208 added.append(f)
197 209 return added
198 210
199 211
200 212 def get_removal_filter(ctx, x=None):
201 213 """return a function to detect files "wrongly" detected as `removed`
202 214
203 215 When a file is removed relative to p1 in a merge, this
204 216 function determines whether the absence is due to a
205 217 deletion from a parent, or whether the merge commit
206 218 itself deletes the file. We decide this by doing a
207 219 simplified three way merge of the manifest entry for
208 220 the file. There are two ways we decide the merge
209 221 itself didn't delete a file:
210 222 - neither parent (nor the merge) contain the file
211 223 - exactly one parent contains the file, and that
212 224 parent has the same filelog entry as the merge
213 225 ancestor (or all of them if there are two). In other
214 226 words, that parent left the file unchanged while the
215 227 other one deleted it.
216 228 One way to think about this is that deleting a file is
217 229 similar to emptying it, so the list of changed files
218 230 should be similar either way. The computation
219 231 described above is not done directly in _filecommit
220 232 when creating the list of changed files, however
221 233 it does something very similar by comparing filelog
222 234 nodes.
223 235 """
224 236
225 237 if x is not None:
226 238 p1, p2, m1, m2 = x
227 239 else:
228 240 p1 = ctx.p1()
229 241 p2 = ctx.p2()
230 242 m1 = p1.manifest()
231 243 m2 = p2.manifest()
232 244
233 245 @util.cachefunc
234 246 def mas():
235 247 p1n = p1.node()
236 248 p2n = p2.node()
237 249 cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
238 250 if not cahs:
239 251 cahs = [node.nullrev]
240 252 return [ctx.repo()[r].manifest() for r in cahs]
241 253
242 254 def deletionfromparent(f):
243 255 if f in m1:
244 256 return f not in m2 and all(
245 257 f in ma and ma.find(f) == m1.find(f) for ma in mas()
246 258 )
247 259 elif f in m2:
248 260 return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
249 261 else:
250 262 return True
251 263
252 264 return deletionfromparent
253 265
254 266
255 267 def computechangesetfilesremoved(ctx):
256 268 """return the list of files removed in a changeset
257 269 """
258 270 removed = []
259 271 for f in ctx.files():
260 272 if f not in ctx:
261 273 removed.append(f)
262 274 if removed:
263 275 rf = get_removal_filter(ctx)
264 276 removed = [r for r in removed if not rf(r)]
265 277 return removed
266 278
267 279
268 280 def computechangesetfilesmerged(ctx):
269 281 """return the list of files merged in a changeset
270 282 """
271 283 merged = []
272 284 if len(ctx.parents()) < 2:
273 285 return merged
274 286 for f in ctx.files():
275 287 if f in ctx:
276 288 fctx = ctx[f]
277 289 parents = fctx._filelog.parents(fctx._filenode)
278 290 if parents[1] != node.nullid:
279 291 merged.append(f)
280 292 return merged
281 293
282 294
283 295 def computechangesetcopies(ctx):
284 296 """return the copies data for a changeset
285 297
286 298 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
287 299
288 300 Each dictionary is in the form: `{newname: oldname}`
289 301 """
290 302 p1copies = {}
291 303 p2copies = {}
292 304 p1 = ctx.p1()
293 305 p2 = ctx.p2()
294 306 narrowmatch = ctx._repo.narrowmatch()
295 307 for dst in ctx.files():
296 308 if not narrowmatch(dst) or dst not in ctx:
297 309 continue
298 310 copied = ctx[dst].renamed()
299 311 if not copied:
300 312 continue
301 313 src, srcnode = copied
302 314 if src in p1 and p1[src].filenode() == srcnode:
303 315 p1copies[dst] = src
304 316 elif src in p2 and p2[src].filenode() == srcnode:
305 317 p2copies[dst] = src
306 318 return p1copies, p2copies
307 319
308 320
309 321 def encodecopies(files, copies):
310 322 items = []
311 323 for i, dst in enumerate(files):
312 324 if dst in copies:
313 325 items.append(b'%d\0%s' % (i, copies[dst]))
314 326 if len(items) != len(copies):
315 327 raise error.ProgrammingError(
316 328 b'some copy targets missing from file list'
317 329 )
318 330 return b"\n".join(items)
319 331
320 332
321 333 def decodecopies(files, data):
322 334 try:
323 335 copies = {}
324 336 if not data:
325 337 return copies
326 338 for l in data.split(b'\n'):
327 339 strindex, src = l.split(b'\0')
328 340 i = int(strindex)
329 341 dst = files[i]
330 342 copies[dst] = src
331 343 return copies
332 344 except (ValueError, IndexError):
333 345 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
334 346 # used different syntax for the value.
335 347 return None
336 348
337 349
338 350 def encodefileindices(files, subset):
339 351 subset = set(subset)
340 352 indices = []
341 353 for i, f in enumerate(files):
342 354 if f in subset:
343 355 indices.append(b'%d' % i)
344 356 return b'\n'.join(indices)
345 357
346 358
347 359 def decodefileindices(files, data):
348 360 try:
349 361 subset = []
350 362 if not data:
351 363 return subset
352 364 for strindex in data.split(b'\n'):
353 365 i = int(strindex)
354 366 if i < 0 or i >= len(files):
355 367 return None
356 368 subset.append(files[i])
357 369 return subset
358 370 except (ValueError, IndexError):
359 371 # Perhaps someone had chosen the same key name (e.g. "added") and
360 372 # used different syntax for the value.
361 373 return None
362 374
363 375
364 376 def encode_files_sidedata(files):
365 377 sortedfiles = sorted(files.touched)
366 378 sidedata = {}
367 379 p1copies = files.copied_from_p1
368 380 if p1copies:
369 381 p1copies = encodecopies(sortedfiles, p1copies)
370 382 sidedata[sidedatamod.SD_P1COPIES] = p1copies
371 383 p2copies = files.copied_from_p2
372 384 if p2copies:
373 385 p2copies = encodecopies(sortedfiles, p2copies)
374 386 sidedata[sidedatamod.SD_P2COPIES] = p2copies
375 387 filesadded = files.added
376 388 if filesadded:
377 389 filesadded = encodefileindices(sortedfiles, filesadded)
378 390 sidedata[sidedatamod.SD_FILESADDED] = filesadded
379 391 filesremoved = files.removed
380 392 if filesremoved:
381 393 filesremoved = encodefileindices(sortedfiles, filesremoved)
382 394 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
383 395 if not sidedata:
384 396 sidedata = None
385 397 return sidedata
386 398
387 399
388 400 def decode_files_sidedata(changelogrevision, sidedata):
389 401 """Return a ChangingFiles instance from a changelogrevision using sidedata
390 402 """
391 403 touched = changelogrevision.files
392 404
393 405 rawindices = sidedata.get(sidedatamod.SD_FILESADDED)
394 406 added = decodefileindices(touched, rawindices)
395 407
396 408 rawindices = sidedata.get(sidedatamod.SD_FILESREMOVED)
397 409 removed = decodefileindices(touched, rawindices)
398 410
399 411 rawcopies = sidedata.get(sidedatamod.SD_P1COPIES)
400 412 p1_copies = decodecopies(touched, rawcopies)
401 413
402 414 rawcopies = sidedata.get(sidedatamod.SD_P2COPIES)
403 415 p2_copies = decodecopies(touched, rawcopies)
404 416
405 417 return ChangingFiles(
406 418 touched=touched,
407 419 added=added,
408 420 removed=removed,
409 421 p1_copies=p1_copies,
410 422 p2_copies=p2_copies,
411 423 )
412 424
413 425
414 426 def _getsidedata(srcrepo, rev):
415 427 ctx = srcrepo[rev]
416 428 filescopies = computechangesetcopies(ctx)
417 429 filesadded = computechangesetfilesadded(ctx)
418 430 filesremoved = computechangesetfilesremoved(ctx)
419 431 sidedata = {}
420 432 if any([filescopies, filesadded, filesremoved]):
421 433 sortedfiles = sorted(ctx.files())
422 434 p1copies, p2copies = filescopies
423 435 p1copies = encodecopies(sortedfiles, p1copies)
424 436 p2copies = encodecopies(sortedfiles, p2copies)
425 437 filesadded = encodefileindices(sortedfiles, filesadded)
426 438 filesremoved = encodefileindices(sortedfiles, filesremoved)
427 439 if p1copies:
428 440 sidedata[sidedatamod.SD_P1COPIES] = p1copies
429 441 if p2copies:
430 442 sidedata[sidedatamod.SD_P2COPIES] = p2copies
431 443 if filesadded:
432 444 sidedata[sidedatamod.SD_FILESADDED] = filesadded
433 445 if filesremoved:
434 446 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
435 447 return sidedata
436 448
437 449
438 450 def getsidedataadder(srcrepo, destrepo):
439 451 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
440 452 if pycompat.iswindows or not use_w:
441 453 return _get_simple_sidedata_adder(srcrepo, destrepo)
442 454 else:
443 455 return _get_worker_sidedata_adder(srcrepo, destrepo)
444 456
445 457
446 458 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
447 459 """The function used by worker precomputing sidedata
448 460
449 461 It reads an input queue containing revision numbers
450 462 It writes to an output queue containing (rev, <sidedata-map>)
451 463
452 464 The `None` input value is used as a stop signal.
453 465
454 466 The `tokens` semaphore is used to avoid having too many unprocessed
455 467 entries. The workers need to acquire one token before fetching a task.
456 468 They will be released by the consumer of the produced data.
457 469 """
458 470 tokens.acquire()
459 471 rev = revs_queue.get()
460 472 while rev is not None:
461 473 data = _getsidedata(srcrepo, rev)
462 474 sidedata_queue.put((rev, data))
463 475 tokens.acquire()
464 476 rev = revs_queue.get()
465 477 # processing of `None` is completed, release the token.
466 478 tokens.release()
467 479
468 480
469 481 BUFF_PER_WORKER = 50
470 482
471 483
472 484 def _get_worker_sidedata_adder(srcrepo, destrepo):
473 485 """The parallel version of the sidedata computation
474 486
475 487 This code spawns a pool of workers that precompute a buffer of sidedata
476 488 before we actually need them"""
477 489 # avoid circular import copies -> scmutil -> worker -> copies
478 490 from . import worker
479 491
480 492 nbworkers = worker._numworkers(srcrepo.ui)
481 493
482 494 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
483 495 revsq = multiprocessing.Queue()
484 496 sidedataq = multiprocessing.Queue()
485 497
486 498 assert srcrepo.filtername is None
487 499 # queue all tasks beforehand, revision numbers are small and it makes
488 500 # synchronisation simpler
489 501 #
490 502 # Since the computation for each node can be quite expensive, the overhead
490 502 # of using a single queue is not relevant. In practice, most computations
491 503 # are fast but some are very expensive and dominate all the other smaller
493 505 # cost.
494 506 for r in srcrepo.changelog.revs():
495 507 revsq.put(r)
496 508 # queue the "no more tasks" markers
497 509 for i in range(nbworkers):
498 510 revsq.put(None)
499 511
500 512 allworkers = []
501 513 for i in range(nbworkers):
502 514 args = (srcrepo, revsq, sidedataq, tokens)
503 515 w = multiprocessing.Process(target=_sidedata_worker, args=args)
504 516 allworkers.append(w)
505 517 w.start()
506 518
507 519 # dictionary to store results for revisions higher than the one we are
508 520 # looking for. For example, if we need the sidedatamap for 42, and 43 is
509 521 # received, we shelve 43 for later use.
510 522 staging = {}
511 523
512 524 def sidedata_companion(revlog, rev):
513 525 sidedata = {}
514 526 if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog
515 527 # Is the data previously shelved ?
516 528 sidedata = staging.pop(rev, None)
517 529 if sidedata is None:
518 530 # look at the queued result until we find the one we are looking
519 531 # for (shelve the other ones)
520 532 r, sidedata = sidedataq.get()
521 533 while r != rev:
522 534 staging[r] = sidedata
523 535 r, sidedata = sidedataq.get()
524 536 tokens.release()
525 537 return False, (), sidedata
526 538
527 539 return sidedata_companion
528 540
529 541
530 542 def _get_simple_sidedata_adder(srcrepo, destrepo):
531 543 """The simple version of the sidedata computation
532 544
533 545 It just computes it in the same thread on request"""
534 546
535 547 def sidedatacompanion(revlog, rev):
536 548 sidedata = {}
537 549 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
538 550 sidedata = _getsidedata(srcrepo, rev)
539 551 return False, (), sidedata
540 552
541 553 return sidedatacompanion
542 554
543 555
544 556 def getsidedataremover(srcrepo, destrepo):
545 557 def sidedatacompanion(revlog, rev):
546 558 f = ()
547 559 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
548 560 if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
549 561 f = (
550 562 sidedatamod.SD_P1COPIES,
551 563 sidedatamod.SD_P2COPIES,
552 564 sidedatamod.SD_FILESADDED,
553 565 sidedatamod.SD_FILESREMOVED,
554 566 )
555 567 return False, f, {}
556 568
557 569 return sidedatacompanion
General Comments 0
You need to be logged in to leave comments. Login now