metadata: filter the `removed` set to only contain relevant data...
marmoute -
r45468:25512a65 default
@@ -1,324 +1,327 @@
1 1 # metadata.py -- code related to various metadata computation and access.
2 2 #
3 3 # Copyright 2019 Google, Inc <martinvonz@google.com>
4 4 # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import multiprocessing
11 11
12 12 from . import (
13 13 error,
14 14 node,
15 15 pycompat,
16 16 util,
17 17 )
18 18
19 19 from .revlogutils import (
20 20 flagutil as sidedataflag,
21 21 sidedata as sidedatamod,
22 22 )
23 23
24 24
25 25 def computechangesetfilesadded(ctx):
26 26 """return the list of files added in a changeset
27 27 """
28 28 added = []
29 29 for f in ctx.files():
30 30 if not any(f in p for p in ctx.parents()):
31 31 added.append(f)
32 32 return added
33 33
34 34
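A minimal sketch of the membership test above, with plain sets standing in for the changeset's file list and the parents' manifests (the real objects support `in` the same way); all data here is hypothetical:

files_in_ctx = {'a.txt', 'b.txt', 'c.txt'}          # files touched by the changeset
parent_manifests = [{'a.txt'}, {'a.txt', 'c.txt'}]  # one container per parent

# A file counts as "added" when no parent already contains it.
added = [f for f in sorted(files_in_ctx)
         if not any(f in p for p in parent_manifests)]
assert added == ['b.txt']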
35 35 def get_removal_filter(ctx, x=None):
36 36 """return a function to detect files "wrongly" detected as `removed`
37 37
38 38 When a file is removed relative to p1 in a merge, this
39 39 function determines whether the absence is due to a
40 40 deletion from a parent, or whether the merge commit
41 41 itself deletes the file. We decide this by doing a
42 42 simplified three way merge of the manifest entry for
43 43 the file. There are two ways we decide the merge
44 44 itself didn't delete a file:
45 45 - neither parent (nor the merge) contains the file
46 46 - exactly one parent contains the file, and that
47 47 parent has the same filelog entry as the merge
48 48 ancestor (or all of them if there are two). In other
49 49 words, that parent left the file unchanged while the
50 50 other one deleted it.
51 51 One way to think about this is that deleting a file is
52 52 similar to emptying it, so the list of changed files
53 53 should be similar either way. The computation
54 54 described above is not done directly in _filecommit
55 55 when creating the list of changed files; however,
56 56 it does something very similar by comparing filelog
57 57 nodes.
58 58 """
59 59
60 60 if x is not None:
61 61 p1, p2, m1, m2 = x
62 62 else:
63 63 p1 = ctx.p1()
64 64 p2 = ctx.p2()
65 65 m1 = p1.manifest()
66 66 m2 = p2.manifest()
67 67
68 68 @util.cachefunc
69 69 def mas():
70 70 p1n = p1.node()
71 71 p2n = p2.node()
72 72 cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
73 73 if not cahs:
74 74 cahs = [node.nullrev]
75 75 return [ctx.repo()[r].manifest() for r in cahs]
76 76
77 77 def deletionfromparent(f):
78 78 if f in m1:
79 79 return f not in m2 and all(
80 80 f in ma and ma.find(f) == m1.find(f) for ma in mas()
81 81 )
82 82 elif f in m2:
83 83 return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
84 84 else:
85 85 return True
86 86
87 87 return deletionfromparent
88 88
89 89
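A self-contained sketch of the rule spelled out in the docstring, with manifests modelled as plain dicts mapping filename to filelog node (the real manifests also support `in` and expose `find(f)`); every name below is illustrative:

def deletion_from_parent(f, m1, m2, ancestor_manifests):
    # Present in p1: the removal is inherited only if p2 dropped the
    # file while p1 kept it identical to every merge ancestor.
    if f in m1:
        return f not in m2 and all(
            f in ma and ma[f] == m1[f] for ma in ancestor_manifests
        )
    # Symmetric case: p1 dropped it, p2 kept it unchanged.
    elif f in m2:
        return all(f in ma and ma[f] == m2[f] for ma in ancestor_manifests)
    # Neither parent (nor the merge) contains the file.
    else:
        return True

ma = {'f.txt': b'node-a'}
assert deletion_from_parent('f.txt', {'f.txt': b'node-a'}, {}, [ma])      # p2 deleted it
assert not deletion_from_parent('f.txt', {'f.txt': b'node-b'}, {}, [ma])  # p1 modified it, so the merge made a real choice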
90 90 def computechangesetfilesremoved(ctx):
91 91 """return the list of files removed in a changeset
92 92 """
93 93 removed = []
94 94 for f in ctx.files():
95 95 if f not in ctx:
96 96 removed.append(f)
97 if removed:
98 rf = get_removal_filter(ctx)
99 removed = [r for r in removed if not rf(r)]
97 100 return removed
98 101
99 102
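This changeset's new lines boil down to one filtering pass; in isolation, with a plain set standing in for the `deletionfromparent` predicate returned by `get_removal_filter` (all values hypothetical):

removed = ['deleted-by-merge.txt', 'inherited-from-p2.txt']
inherited = {'inherited-from-p2.txt'}
rf = inherited.__contains__          # stand-in for get_removal_filter(ctx)

removed = [r for r in removed if not rf(r)]
assert removed == ['deleted-by-merge.txt']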
100 103 def computechangesetcopies(ctx):
101 104 """return the copies data for a changeset
102 105
103 106 The copies data are returned as a pair of dictionaries (p1copies, p2copies).
104 107
105 108 Each dictionary is of the form: `{newname: oldname}`
106 109 """
107 110 p1copies = {}
108 111 p2copies = {}
109 112 p1 = ctx.p1()
110 113 p2 = ctx.p2()
111 114 narrowmatch = ctx._repo.narrowmatch()
112 115 for dst in ctx.files():
113 116 if not narrowmatch(dst) or dst not in ctx:
114 117 continue
115 118 copied = ctx[dst].renamed()
116 119 if not copied:
117 120 continue
118 121 src, srcnode = copied
119 122 if src in p1 and p1[src].filenode() == srcnode:
120 123 p1copies[dst] = src
121 124 elif src in p2 and p2[src].filenode() == srcnode:
122 125 p2copies[dst] = src
123 126 return p1copies, p2copies
124 127
125 128
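For a changeset where `new.txt` was recorded as a rename of `old.txt` against p1 and nothing was copy-traced against p2, the returned pair would look like this (hypothetical values):

p1copies = {b'new.txt': b'old.txt'}   # {destination: source}, traced against p1
p2copies = {}                         # nothing traced against p2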
126 129 def encodecopies(files, copies):
127 130 items = []
128 131 for i, dst in enumerate(files):
129 132 if dst in copies:
130 133 items.append(b'%d\0%s' % (i, copies[dst]))
131 134 if len(items) != len(copies):
132 135 raise error.ProgrammingError(
133 136 b'some copy targets missing from file list'
134 137 )
135 138 return b"\n".join(items)
136 139
137 140
138 141 def decodecopies(files, data):
139 142 try:
140 143 copies = {}
141 144 if not data:
142 145 return copies
143 146 for l in data.split(b'\n'):
144 147 strindex, src = l.split(b'\0')
145 148 i = int(strindex)
146 149 dst = files[i]
147 150 copies[dst] = src
148 151 return copies
149 152 except (ValueError, IndexError):
150 153 # Perhaps someone had chosen the same key name (e.g. "p1copies") and
151 154 # used different syntax for the value.
152 155 return None
153 156
154 157
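Because `encodecopies` and `decodecopies` are pure functions over the sorted file list, their index-NUL-source line format can be exercised directly; a round trip under the definitions above:

files = [b'a.txt', b'b.txt', b'c.txt']   # must be the sorted ctx.files()
copies = {b'b.txt': b'old.txt'}          # {destination: source}

data = encodecopies(files, copies)
assert data == b'1\x00old.txt'           # index of b'b.txt', NUL, source name
assert decodecopies(files, data) == {b'b.txt': b'old.txt'}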
155 158 def encodefileindices(files, subset):
156 159 subset = set(subset)
157 160 indices = []
158 161 for i, f in enumerate(files):
159 162 if f in subset:
160 163 indices.append(b'%d' % i)
161 164 return b'\n'.join(indices)
162 165
163 166
164 167 def decodefileindices(files, data):
165 168 try:
166 169 subset = []
167 170 if not data:
168 171 return subset
169 172 for strindex in data.split(b'\n'):
170 173 i = int(strindex)
171 174 if i < 0 or i >= len(files):
172 175 return None
173 176 subset.append(files[i])
174 177 return subset
175 178 except (ValueError, IndexError):
176 179 # Perhaps someone had chosen the same key name (e.g. "added") and
177 180 # used different syntax for the value.
178 181 return None
179 182
180 183
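The same index-into-sorted-files trick backs the added/removed sets, one decimal index per line; again a direct round trip against the definitions above:

files = [b'a.txt', b'b.txt', b'c.txt']

data = encodefileindices(files, [b'a.txt', b'c.txt'])
assert data == b'0\n2'                               # newline-separated indices
assert decodefileindices(files, data) == [b'a.txt', b'c.txt']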
181 184 def _getsidedata(srcrepo, rev):
182 185 ctx = srcrepo[rev]
183 186 filescopies = computechangesetcopies(ctx)
184 187 filesadded = computechangesetfilesadded(ctx)
185 188 filesremoved = computechangesetfilesremoved(ctx)
186 189 sidedata = {}
187 190 if any([filescopies, filesadded, filesremoved]):
188 191 sortedfiles = sorted(ctx.files())
189 192 p1copies, p2copies = filescopies
190 193 p1copies = encodecopies(sortedfiles, p1copies)
191 194 p2copies = encodecopies(sortedfiles, p2copies)
192 195 filesadded = encodefileindices(sortedfiles, filesadded)
193 196 filesremoved = encodefileindices(sortedfiles, filesremoved)
194 197 if p1copies:
195 198 sidedata[sidedatamod.SD_P1COPIES] = p1copies
196 199 if p2copies:
197 200 sidedata[sidedatamod.SD_P2COPIES] = p2copies
198 201 if filesadded:
199 202 sidedata[sidedatamod.SD_FILESADDED] = filesadded
200 203 if filesremoved:
201 204 sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved
202 205 return sidedata
203 206
204 207
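Putting the encoders together, the map returned by `_getsidedata` for a revision that adds `b'b.txt'` as a copy of `b'old.txt'` might look like this (hypothetical values; the keys are the `SD_*` constants from `revlogutils.sidedata`):

sidedata = {
    sidedatamod.SD_P1COPIES: b'1\x00old.txt',  # encodecopies() output
    sidedatamod.SD_FILESADDED: b'1',           # encodefileindices() output
}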
205 208 def getsidedataadder(srcrepo, destrepo):
206 209 use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade')
207 210 if pycompat.iswindows or not use_w:
208 211 return _get_simple_sidedata_adder(srcrepo, destrepo)
209 212 else:
210 213 return _get_worker_sidedata_adder(srcrepo, destrepo)
211 214
212 215
213 216 def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
214 217 """The function used by worker precomputing sidedata
215 218
216 219 It reads an input queue containing revision numbers
217 220 It writes to an output queue containing (rev, <sidedata-map>) pairs
218 221
219 222 The `None` input value is used as a stop signal.
220 223
221 224 The `tokens` semaphore is used to avoid having too many unprocessed
222 225 entries. Each worker needs to acquire one token before fetching a task.
223 226 Tokens are released by the consumer of the produced data.
224 227 """
225 228 tokens.acquire()
226 229 rev = revs_queue.get()
227 230 while rev is not None:
228 231 data = _getsidedata(srcrepo, rev)
229 232 sidedata_queue.put((rev, data))
230 233 tokens.acquire()
231 234 rev = revs_queue.get()
232 235 # processing of `None` is completed, release the token.
233 236 tokens.release()
234 237
235 238
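The acquire-before-fetch pattern above caps how many computed-but-unconsumed results can pile up. A minimal self-contained sketch of the same backpressure scheme (all names local to the example):

import multiprocessing

def worker(tasks, results, tokens):
    # One token per in-flight result; `None` is the stop signal.
    tokens.acquire()
    task = tasks.get()
    while task is not None:
        results.put((task, task * task))  # stand-in computation
        tokens.acquire()
        task = tasks.get()
    tokens.release()  # give back the token taken for the `None` marker

if __name__ == '__main__':
    tasks = multiprocessing.Queue()
    results = multiprocessing.Queue()
    tokens = multiprocessing.BoundedSemaphore(4)  # cf. BUFF_PER_WORKER below
    for t in (1, 2, 3, None):
        tasks.put(t)
    proc = multiprocessing.Process(target=worker, args=(tasks, results, tokens))
    proc.start()
    got = {}
    for _ in range(3):
        rev, data = results.get()
        tokens.release()  # consuming a result frees one slot
        got[rev] = data
    proc.join()
    assert got == {1: 1, 2: 4, 3: 9}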
236 239 BUFF_PER_WORKER = 50
237 240
238 241
239 242 def _get_worker_sidedata_adder(srcrepo, destrepo):
240 243 """The parallel version of the sidedata computation
241 244
242 245 This code spawns a pool of workers that precompute a buffer of sidedata
243 246 before we actually need them"""
244 247 # avoid circular import copies -> scmutil -> worker -> copies
245 248 from . import worker
246 249
247 250 nbworkers = worker._numworkers(srcrepo.ui)
248 251
249 252 tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
250 253 revsq = multiprocessing.Queue()
251 254 sidedataq = multiprocessing.Queue()
252 255
253 256 assert srcrepo.filtername is None
254 257 # queue all tasks beforehand, revision numbers are small and it makes
255 258 # synchronisation simpler
256 259 #
257 260 # Since the computation for each node can be quite expensive, the overhead
258 261 # of using a single queue is not relevant. In practice, most computations
259 262 # are fast but some are very expensive and dominate all the other smaller
260 263 # costs.
261 264 for r in srcrepo.changelog.revs():
262 265 revsq.put(r)
263 266 # queue the "no more tasks" markers
264 267 for i in range(nbworkers):
265 268 revsq.put(None)
266 269
267 270 allworkers = []
268 271 for i in range(nbworkers):
269 272 args = (srcrepo, revsq, sidedataq, tokens)
270 273 w = multiprocessing.Process(target=_sidedata_worker, args=args)
271 274 allworkers.append(w)
272 275 w.start()
273 276
274 277 # dictionary to store results for revisions higher than the one we are
275 278 # looking for. For example, if we need the sidedata map for 42, and 43 is
276 279 # received, we shelve 43 for later use.
277 280 staging = {}
278 281
279 282 def sidedata_companion(revlog, rev):
280 283 sidedata = {}
281 284 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
282 285 # Was the data previously shelved?
283 286 sidedata = staging.pop(rev, None)
284 287 if sidedata is None:
285 288 # look at the queued results until we find the one we are looking
286 289 # for (shelving the other ones)
287 290 r, sidedata = sidedataq.get()
288 291 while r != rev:
289 292 staging[r] = sidedata
290 293 r, sidedata = sidedataq.get()
291 294 tokens.release()
292 295 return False, (), sidedata
293 296
294 297 return sidedata_companion
295 298
296 299
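The `staging` bookkeeping in `sidedata_companion` is just in-order delivery over an out-of-order stream; isolated with plain data instead of queues (hypothetical values):

arrivals = iter([(43, 'd43'), (41, 'd41'), (42, 'd42')])  # out of order
staging = {}

def get_for(rev):
    data = staging.pop(rev, None)      # previously shelved?
    if data is None:
        r, data = next(arrivals)
        while r != rev:                # shelve everything else on the way
            staging[r] = data
            r, data = next(arrivals)
    return data

assert get_for(41) == 'd41'   # shelves 43 while draining
assert get_for(42) == 'd42'
assert get_for(43) == 'd43'   # served straight from staging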
297 300 def _get_simple_sidedata_adder(srcrepo, destrepo):
298 301 """The simple version of the sidedata computation
299 302
300 303 It just computes it in the same thread, on request"""
301 304
302 305 def sidedatacompanion(revlog, rev):
303 306 sidedata = {}
304 307 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
305 308 sidedata = _getsidedata(srcrepo, rev)
306 309 return False, (), sidedata
307 310
308 311 return sidedatacompanion
309 312
310 313
311 314 def getsidedataremover(srcrepo, destrepo):
312 315 def sidedatacompanion(revlog, rev):
313 316 f = ()
314 317 if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog
315 318 if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA:
316 319 f = (
317 320 sidedatamod.SD_P1COPIES,
318 321 sidedatamod.SD_P2COPIES,
319 322 sidedatamod.SD_FILESADDED,
320 323 sidedatamod.SD_FILESREMOVED,
321 324 )
322 325 return False, f, {}
323 326
324 327 return sidedatacompanion