changing-files: document the various sets...
marmoute
r46078:1f50bcc9 default
@@ -1,424 +1,479 @@
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11
12 12 from . import (
13 13 error,
14 14 node,
15 15 pycompat,
16 16 util,
17 17 )
18 18
19 19 from .revlogutils import (
20 20 flagutil as sidedataflag,
21 21 sidedata as sidedatamod,
22 22 )
23 23
24 24
25 25 class ChangingFiles(object):
26 """A class recording the changes made to a file by a revision
26 """A class recording the changes made to a file by a changeset
27
28 Actions performed on files are gathered into 3 sets:
29
30 - added: files actively added in the changeset.
31 - removed: files actively removed by the changeset
32 - touched: files affected by the changeset (modified, added or removed)
33
34 and copy information is held by 2 mappings
35
36 - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
37 - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies
38
39 See their inline help for details.
27 40 """
28 41
29 42 def __init__(
30 43 self, touched=(), added=(), removed=(), p1_copies=(), p2_copies=(),
31 44 ):
32 45 self._added = set(added)
33 46 self._removed = set(removed)
34 47 self._touched = set(touched)
35 48 self._touched.update(self._added)
36 49 self._touched.update(self._removed)
37 50 self._p1_copies = dict(p1_copies)
38 51 self._p2_copies = dict(p2_copies)
39 52
40 53 @property
41 54 def added(self):
55 """files actively added in the changeset
56
57 Any file present in the changeset that was absent in all of its
58 parents.
59
60 In case of merge, this means a file absent in one of the parents but
61 existing in the other will *not* be contained in this set (it was
62 added by an ancestor).
63 """
42 64 return frozenset(self._added)
43 65
44 66 def mark_added(self, filename):
45 67 self._added.add(filename)
46 68 self._touched.add(filename)
47 69
48 70 def update_added(self, filenames):
49 71 for f in filenames:
50 72 self.mark_added(f)
51 73
52 74 @property
53 75 def removed(self):
76 """files actively removed by the changeset
77
78 In case of merge, this will only contain files whose removal drops "new"
79 content. For any file absent in the current changeset:
80
81 a) If the file exists in both parents, it is clearly "actively" removed
82 by this changeset.
83
84 b) If a file exists in only one parent and in none of the common
85 ancestors, then the file was newly added in one of the merged branches
86 and then got "actively" removed.
87
88 c) If a file exists in only one parent and at least one of the common
89 ancestors using the same filenode, then the file was unchanged on one
90 side and deleted on the other side. The merge "passively" propagated
91 that deletion, but didn't "actively" remove the file. In this case the
92 file is *not* included in the `removed` set.
93
94 d) If a file exists in only one parent and at least one of the common
95 ancestors using a different filenode, then the file was changed on one
96 side and removed on the other side. The merge process "actively"
97 decided to drop the new change and delete the file. Unlike in the
98 previous case, (c), the file is included in the `removed` set.
99
100 Summary table for merge:
101
102 case | exists in parents | exists in gca || removed
103  (a) |       both        |       *       ||   yes
104  (b) |        one        |      none     ||   yes
105  (c) |        one        | same filenode ||   no
106  (d) |        one        |  new filenode ||   yes
107 """
54 108 return frozenset(self._removed)
55 109
56 110 def mark_removed(self, filename):
57 111 self._removed.add(filename)
58 112 self._touched.add(filename)
59 113
60 114 def update_removed(self, filenames):
61 115 for f in filenames:
62 116 self.mark_removed(f)
63 117
64 118 @property
65 119 def touched(self):
120 """files either actively modified, added or removed"""
66 121 return frozenset(self._touched)
67 122
68 123 def mark_touched(self, filename):
69 124 self._touched.add(filename)
70 125
71 126 def update_touched(self, filenames):
72 127 for f in filenames:
73 128 self.mark_touched(f)
74 129
75 130 @property
76 131 def copied_from_p1(self):
77 132 return self._p1_copies.copy()
78 133
79 134 def mark_copied_from_p1(self, source, dest):
80 135 self._p1_copies[dest] = source
81 136
82 137 def update_copies_from_p1(self, copies):
83 138 for dest, source in copies.items():
84 139 self.mark_copied_from_p1(source, dest)
85 140
86 141 @property
87 142 def copied_from_p2(self):
88 143 return self._p2_copies.copy()
89 144
90 145 def mark_copied_from_p2(self, source, dest):
91 146 self._p2_copies[dest] = source
92 147
93 148 def update_copies_from_p2(self, copies):
94 149 for dest, source in copies.items():
95 150 self.mark_copied_from_p2(source, dest)
96 151
97 152
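# A minimal usage sketch of the ChangingFiles accumulator defined above, with
# made-up file names: added and removed files automatically end up in the
# `touched` set as well, and copies are recorded as {dest: source}.
files = ChangingFiles()
files.mark_added(b'new-module.py')
files.mark_removed(b'obsolete.py')
files.mark_touched(b'README')
files.mark_copied_from_p1(b'util.py', b'util-renamed.py')  # (source, dest)

assert files.added == frozenset({b'new-module.py'})
assert files.removed == frozenset({b'obsolete.py'})
assert files.touched == frozenset({b'new-module.py', b'obsolete.py', b'README'})
assert files.copied_from_p1 == {b'util-renamed.py': b'util.py'}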
98 153 def computechangesetfilesadded(ctx):
99 154 """return the list of files added in a changeset
100 155 """
101 156 added = []
102 157 for f in ctx.files():
103 158 if not any(f in p for p in ctx.parents()):
104 159 added.append(f)
105 160 return added
106 161
107 162
108 163 def get_removal_filter(ctx, x=None):
109 164 """return a function to detect files "wrongly" detected as `removed`
110 165
111 166 When a file is removed relative to p1 in a merge, this
112 167 function determines whether the absence is due to a
113 168 deletion from a parent, or whether the merge commit
114 169 itself deletes the file. We decide this by doing a
115 170 simplified three way merge of the manifest entry for
116 171 the file. There are two ways we decide the merge
117 172 itself didn't delete a file:
118 173 - neither parent (nor the merge) contain the file
119 174 - exactly one parent contains the file, and that
120 175 parent has the same filelog entry as the merge
121 176 ancestor (or all of them if there are two). In other
122 177 words, that parent left the file unchanged while the
123 178 other one deleted it.
124 179 One way to think about this is that deleting a file is
125 180 similar to emptying it, so the list of changed files
126 181 should be similar either way. The computation
127 182 described above is not done directly in _filecommit
128 183 when creating the list of changed files; however,
129 184 it does something very similar by comparing filelog
130 185 nodes.
131 186 """
132 187
133 188 if x is not None:
134 189 p1, p2, m1, m2 = x
135 190 else:
136 191 p1 = ctx.p1()
137 192 p2 = ctx.p2()
138 193 m1 = p1.manifest()
139 194 m2 = p2.manifest()
140 195
141 196 @util.cachefunc
142 197 def mas():
143 198 p1n = p1.node()
144 199 p2n = p2.node()
145 200 cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
146 201 if not cahs:
147 202 cahs = [node.nullrev]
148 203 return [ctx.repo()[r].manifest() for r in cahs]
149 204
150 205 def deletionfromparent(f):
151 206 if f in m1:
152 207 return f not in m2 and all(
153 208 f in ma and ma.find(f) == m1.find(f) for ma in mas()
154 209 )
155 210 elif f in m2:
156 211 return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
157 212 else:
158 213 return True
159 214
160 215 return deletionfromparent
161 216
162 217
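# A toy illustration of the filtering decision above, using dict-backed
# stand-ins for manifests (hypothetical helpers, not the real manifest API):
# find(f) returns the filenode recorded for f, mirroring the comparison done
# by the inner deletionfromparent() function.
class _FakeManifest(dict):
    def find(self, f):
        return self[f]

_m1 = _FakeManifest({b'kept.txt': b'node-A'})   # parent 1: file unchanged
_m2 = _FakeManifest({})                         # parent 2: file deleted
_ma = _FakeManifest({b'kept.txt': b'node-A'})   # merge ancestor

def _deletion_from_parent(f, m1, m2, ancestors):
    # same decision logic as deletionfromparent() above
    if f in m1:
        return f not in m2 and all(
            f in ma and ma.find(f) == m1.find(f) for ma in ancestors
        )
    elif f in m2:
        return all(f in ma and ma.find(f) == m2.find(f) for ma in ancestors)
    else:
        return True

# case (c) of the summary table: the deletion came from a parent, so the merge
# did not "actively" remove the file and it gets filtered out of `removed`.
assert _deletion_from_parent(b'kept.txt', _m1, _m2, [_ma]) is True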
163 218 def computechangesetfilesremoved(ctx):
164 219 """return the list of files removed in a changeset
165 220 """
166 221 removed = []
167 222 for f in ctx.files():
168 223 if f not in ctx:
169 224 removed.append(f)
170 225 if removed:
171 226 rf = get_removal_filter(ctx)
172 227 removed = [r for r in removed if not rf(r)]
173 228 return removed
174 229
175 230
176 231 def computechangesetcopies(ctx):
177 232 """return the copies data for a changeset
178 233
179 234 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
180 235
181 236 Each dictionary is in the form: `{newname: oldname}`
182 237 """
183 238 p1copies = {}
184 239 p2copies = {}
185 240 p1 = ctx.p1()
186 241 p2 = ctx.p2()
187 242 narrowmatch = ctx._repo.narrowmatch()
188 243 for dst in ctx.files():
189 244 if not narrowmatch(dst) or dst not in ctx:
190 245 continue
191 246 copied = ctx[dst].renamed()
192 247 if not copied:
193 248 continue
194 249 src, srcnode = copied
195 250 if src in p1 and p1[src].filenode() == srcnode:
196 251 p1copies[dst] = src
197 252 elif src in p2 and p2[src].filenode() == srcnode:
198 253 p2copies[dst] = src
199 254 return p1copies, p2copies
200 255
201 256
202 257 def encodecopies(files, copies):
203 258 items = []
204 259 for i, dst in enumerate(files):
205 260 if dst in copies:
206 261 items.append(b'%d\0%s' % (i, copies[dst]))
207 262 if len(items) != len(copies):
208 263 raise error.ProgrammingError(
209 264 b'some copy targets missing from file list'
210 265 )
211 266 return b"\n".join(items)
212 267
213 268
214 269 def decodecopies(files, data):
215 270 try:
216 271 copies = {}
217 272 if not data:
218 273 return copies
219 274 for l in data.split(b'\n'):
220 275 strindex, src = l.split(b'\0')
221 276 i = int(strindex)
222 277 dst = files[i]
223 278 copies[dst] = src
224 279 return copies
225 280 except (ValueError, IndexError):
226 281 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
227 282 # used different syntax for the value.
228 283 return None
229 284
230 285
231 286 def encodefileindices(files, subset):
232 287 subset = set(subset)
233 288 indices = []
234 289 for i, f in enumerate(files):
235 290 if f in subset:
236 291 indices.append(b'%d' % i)
237 292 return b'\n'.join(indices)
238 293
239 294
240 295 def decodefileindices(files, data):
241 296 try:
242 297 subset = []
243 298 if not data:
244 299 return subset
245 300 for strindex in data.split(b'\n'):
246 301 i = int(strindex)
247 302 if i < 0 or i >= len(files):
248 303 return None
249 304 subset.append(files[i])
250 305 return subset
251 306 except (ValueError, IndexError):
252 307 # Perhaps someone had chosen the same key name (e.g. "added") and
253 308 # used different syntax for the value.
254 309 return None
255 310
256 311
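# A small round-trip sketch of the two sidedata wire formats defined above,
# for a hypothetical changeset: copies are encoded as "<file-index>\0<source>"
# lines, file sets as newline-separated indices into the sorted file list.
_files = sorted([b'a.txt', b'b.txt', b'c.txt'])
_p1copies = {b'b.txt': b'a.txt'}                  # b.txt copied from a.txt
_added = [b'b.txt', b'c.txt']

_raw_copies = encodecopies(_files, _p1copies)     # b'1\x00a.txt'
_raw_added = encodefileindices(_files, _added)    # b'1\n2'

assert decodecopies(_files, _raw_copies) == _p1copies
assert decodefileindices(_files, _raw_added) == [b'b.txt', b'c.txt']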
257 312 def encode_copies_sidedata(files):
258 313 sortedfiles = sorted(files.touched)
259 314 sidedata = {}
260 315 p1copies = files.copied_from_p1
261 316 if p1copies:
262 317 p1copies = encodecopies(sortedfiles, p1copies)
263 318 sidedata[sidedatamod.SD_P1COPIES] = p1copies
264 319 p2copies = files.copied_from_p2
265 320 if p2copies:
266 321 p2copies = encodecopies(sortedfiles, p2copies)
267 322 sidedata[sidedatamod.SD_P2COPIES] = p2copies
268 323 filesadded = files.added
269 324 if filesadded:
270 325 filesadded = encodefileindices(sortedfiles, filesadded)
271 326 sidedata[sidedatamod.SD_FILESADDED] = filesadded
272 327 filesremoved = files.removed
273 328 if filesremoved:
274 329 filesremoved = encodefileindices(sortedfiles, filesremoved)
275 330 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
276 331 if not sidedata:
277 332 sidedata = None
278 333 return sidedata
279 334
280 335
281 336 def _getsidedata(srcrepo, rev):
282 337 ctx = srcrepo[rev]
283 338 filescopies = computechangesetcopies(ctx)
284 339 filesadded = computechangesetfilesadded(ctx)
285 340 filesremoved = computechangesetfilesremoved(ctx)
286 341 sidedata = {}
287 342 if any([filescopies, filesadded, filesremoved]):
288 343 sortedfiles = sorted(ctx.files())
289 344 p1copies, p2copies = filescopies
290 345 p1copies = encodecopies(sortedfiles, p1copies)
291 346 p2copies = encodecopies(sortedfiles, p2copies)
292 347 filesadded = encodefileindices(sortedfiles, filesadded)
293 348 filesremoved = encodefileindices(sortedfiles, filesremoved)
294 349 if p1copies:
295 350 sidedata[sidedatamod.SD_P1COPIES] = p1copies
296 351 if p2copies:
297 352 sidedata[sidedatamod.SD_P2COPIES] = p2copies
298 353 if filesadded:
299 354 sidedata[sidedatamod.SD_FILESADDED] = filesadded
300 355 if filesremoved:
301 356 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
302 357 return sidedata
303 358
304 359
305 360 def getsidedataadder(srcrepo, destrepo):
306 361 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
307 362 if pycompat.iswindows or not use_w:
308 363 return _get_simple_sidedata_adder(srcrepo, destrepo)
309 364 else:
310 365 return _get_worker_sidedata_adder(srcrepo, destrepo)
311 366
312 367
313 368 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
314 369 """The function used by worker precomputing sidedata
315 370
316 371 It reads an input queue containing revision numbers
317 372 It writes (rev, <sidedata-map>) entries to an output queue
318 373
319 374 The `None` input value is used as a stop signal.
320 375
321 376 The `tokens` semaphore is used to avoid having too many unprocessed
322 377 entries. The workers need to acquire one token before fetching a task.
323 378 They will be released by the consumer of the produced data.
324 379 """
325 380 tokens.acquire()
326 381 rev = revs_queue.get()
327 382 while rev is not None:
328 383 data = _getsidedata(srcrepo, rev)
329 384 sidedata_queue.put((rev, data))
330 385 tokens.acquire()
331 386 rev = revs_queue.get()
332 387 # processing of `None` is completed, release the token.
333 388 tokens.release()
334 389
335 390
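# A self-contained sketch of the token-bounded producer/consumer pattern used
# by _sidedata_worker above, with a trivial squaring task standing in for the
# sidedata computation (all names here are hypothetical).
import multiprocessing as _mp

def _square_worker(in_q, out_q, tokens):
    tokens.acquire()                    # hold a token before taking a task
    item = in_q.get()
    while item is not None:
        out_q.put((item, item * item))
        tokens.acquire()
        item = in_q.get()
    tokens.release()                    # give back the token taken for `None`

if __name__ == '__main__':
    nbworkers = 2
    tokens = _mp.BoundedSemaphore(nbworkers * 4)   # caps unconsumed results
    in_q, out_q = _mp.Queue(), _mp.Queue()
    for i in range(10):
        in_q.put(i)
    for _ in range(nbworkers):
        in_q.put(None)                  # one stop marker per worker
    workers = [
        _mp.Process(target=_square_worker, args=(in_q, out_q, tokens))
        for _ in range(nbworkers)
    ]
    for w in workers:
        w.start()
    for _ in range(10):
        rev, data = out_q.get()
        tokens.release()                # free buffer space as results drain
    for w in workers:
        w.join()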
336 391 BUFF_PER_WORKER = 50
337 392
338 393
339 394 def _get_worker_sidedata_adder(srcrepo, destrepo):
340 395 """The parallel version of the sidedata computation
341 396
342 397 This code spawns a pool of workers that precompute a buffer of sidedata
343 398 before we actually need them"""
344 399 # avoid circular import copies -> scmutil -> worker -> copies
345 400 from . import worker
346 401
347 402 nbworkers = worker._numworkers(srcrepo.ui)
348 403
349 404 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
350 405 revsq = multiprocessing.Queue()
351 406 sidedataq = multiprocessing.Queue()
352 407
353 408 assert srcrepo.filtername is None
354 409 # queue all tasks beforehand, revision numbers are small and it makes
355 410 # synchronisation simpler
356 411 #
357 412 # Since the computation for each node can be quite expensive, the overhead
358 413 # of using a single queue is not relevant. In practice, most computations
359 414 # are fast but some are very expensive and dominate all the other smaller
360 415 # costs.
361 416 for r in srcrepo.changelog.revs():
362 417 revsq.put(r)
363 418 # queue the "no more tasks" markers
364 419 for i in range(nbworkers):
365 420 revsq.put(None)
366 421
367 422 allworkers = []
368 423 for i in range(nbworkers):
369 424 args = (srcrepo, revsq, sidedataq, tokens)
370 425 w = multiprocessing.Process(target=_sidedata_worker, args=args)
371 426 allworkers.append(w)
372 427 w.start()
373 428
374 429 # dictionary to store results for revisions higher than the one we are
375 430 # looking for. For example, if we need the sidedatamap for 42, and 43 is
376 431 # received, we shelve 43 for later use.
377 432 staging = {}
378 433
379 434 def sidedata_companion(revlog, rev):
380 435 sidedata = {}
381 436 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
382 437 # Was the data previously shelved?
383 438 sidedata = staging.pop(rev, None)
384 439 if sidedata is None:
385 440 # look at the queued results until we find the one we are looking
386 441 # for (shelve the other ones)
387 442 r, sidedata = sidedataq.get()
388 443 while r != rev:
389 444 staging[r] = sidedata
390 445 r, sidedata = sidedataq.get()
391 446 tokens.release()
392 447 return False, (), sidedata
393 448
394 449 return sidedata_companion
395 450
396 451
397 452 def _get_simple_sidedata_adder(srcrepo, destrepo):
398 453 """The simple version of the sidedata computation
399 454
400 455 It just computes it in the same thread on request"""
401 456
402 457 def sidedatacompanion(revlog, rev):
403 458 sidedata = {}
404 459 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
405 460 sidedata = _getsidedata(srcrepo, rev)
406 461 return False, (), sidedata
407 462
408 463 return sidedatacompanion
409 464
410 465
411 466 def getsidedataremover(srcrepo, destrepo):
412 467 def sidedatacompanion(revlog, rev):
413 468 f = ()
414 469 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
415 470 if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
416 471 f = (
417 472 sidedatamod.SD_P1COPIES,
418 473 sidedatamod.SD_P2COPIES,
419 474 sidedatamod.SD_FILESADDED,
420 475 sidedatamod.SD_FILESREMOVED,
421 476 )
422 477 return False, f, {}
423 478
424 479 return sidedatacompanion
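# Putting the pieces together: a sketch of running a ChangingFiles instance
# through encode_copies_sidedata, with made-up file names; the resulting map
# is keyed by the SD_* constants from revlogutils.sidedata, and the values are
# the encoded byte strings shown earlier.
_cf = ChangingFiles(
    touched=(b'a.txt', b'b.txt'),
    added=(b'b.txt',),
    p1_copies={b'b.txt': b'a.txt'},
)
_sd = encode_copies_sidedata(_cf)
assert sidedatamod.SD_P1COPIES in _sd
assert sidedatamod.SD_FILESADDED in _sd
assert sidedatamod.SD_FILESREMOVED not in _sd   # nothing was removed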