@@ -1,324 +1,327 @@
1 | 1 | # metadata.py -- code related to various metadata computation and access.
2 | 2 | #
3 | 3 | # Copyright 2019 Google, Inc <martinvonz@google.com>
4 | 4 | # Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
5 | 5 | #
6 | 6 | # This software may be used and distributed according to the terms of the
7 | 7 | # GNU General Public License version 2 or any later version.
8 | 8 | from __future__ import absolute_import, print_function
9 | 9 |
10 | 10 | import multiprocessing
11 | 11 |
12 | 12 | from . import (
13 | 13 |     error,
14 | 14 |     node,
15 | 15 |     pycompat,
16 | 16 |     util,
17 | 17 | )
18 | 18 |
19 | 19 | from .revlogutils import (
20 | 20 |     flagutil as sidedataflag,
21 | 21 |     sidedata as sidedatamod,
22 | 22 | )
23 | 23 |
24 | 24 |
25 | 25 | def computechangesetfilesadded(ctx):
26 | 26 |     """return the list of files added in a changeset
27 | 27 |     """
28 | 28 |     added = []
29 | 29 |     for f in ctx.files():
30 | 30 |         if not any(f in p for p in ctx.parents()):
31 | 31 |             added.append(f)
32 | 32 |     return added
33 | 33 |
34 | 34 |
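
A file counts as added exactly when no parent contains it. A minimal standalone sketch of that membership test, with plain sets standing in for the parent contexts (hypothetical data, not this module's API):

    # `changed` stands in for ctx.files(), `parents` for ctx.parents();
    # `f in p` mirrors the manifest membership test used above.
    changed = [b'a.txt', b'b.txt', b'c.txt']
    parents = [{b'a.txt'}, {b'a.txt', b'c.txt'}]

    added = [f for f in changed if not any(f in p for p in parents)]
    assert added == [b'b.txt']  # present in no parent => added
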
35 | 35 | def get_removal_filter(ctx, x=None):
36 | 36 |     """return a function to detect files "wrongly" detected as `removed`
37 | 37 |
38 | 38 |     When a file is removed relative to p1 in a merge, this
39 | 39 |     function determines whether the absence is due to a
40 | 40 |     deletion from a parent, or whether the merge commit
41 | 41 |     itself deletes the file. We decide this by doing a
42 | 42 |     simplified three way merge of the manifest entry for
43 | 43 |     the file. There are two ways we decide the merge
44 | 44 |     itself didn't delete a file:
45 | 45 |     - neither parent (nor the merge) contains the file
46 | 46 |     - exactly one parent contains the file, and that
47 | 47 |       parent has the same filelog entry as the merge
48 | 48 |       ancestor (or all of them if there are two). In other
49 | 49 |       words, that parent left the file unchanged while the
50 | 50 |       other one deleted it.
51 | 51 |     One way to think about this is that deleting a file is
52 | 52 |     similar to emptying it, so the list of changed files
53 | 53 |     should be similar either way. The computation
54 | 54 |     described above is not done directly in _filecommit
55 | 55 |     when creating the list of changed files, however
56 | 56 |     it does something very similar by comparing filelog
57 | 57 |     nodes.
58 | 58 |     """
59 | 59 |
60 | 60 |     if x is not None:
61 | 61 |         p1, p2, m1, m2 = x
62 | 62 |     else:
63 | 63 |         p1 = ctx.p1()
64 | 64 |         p2 = ctx.p2()
65 | 65 |         m1 = p1.manifest()
66 | 66 |         m2 = p2.manifest()
67 | 67 |
68 | 68 |     @util.cachefunc
69 | 69 |     def mas():
70 | 70 |         p1n = p1.node()
71 | 71 |         p2n = p2.node()
72 | 72 |         cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
73 | 73 |         if not cahs:
74 | 74 |             cahs = [node.nullrev]
75 | 75 |         return [ctx.repo()[r].manifest() for r in cahs]
76 | 76 |
77 | 77 |     def deletionfromparent(f):
78 | 78 |         if f in m1:
79 | 79 |             return f not in m2 and all(
80 | 80 |                 f in ma and ma.find(f) == m1.find(f) for ma in mas()
81 | 81 |             )
82 | 82 |         elif f in m2:
83 | 83 |             return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
84 | 84 |         else:
85 | 85 |             return True
86 | 86 |
87 | 87 |     return deletionfromparent
88 | 88 |
89 | 89 |
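
To make the decision rule concrete, here is a standalone restatement of deletionfromparent with plain dicts standing in for manifests (dict indexing replaces manifest.find(); the data is hypothetical):

    def deletion_from_parent(f, m1, m2, ancestors):
        # True when the removal comes from a parent, not from the merge
        if f in m1:
            return f not in m2 and all(
                f in ma and ma[f] == m1[f] for ma in ancestors
            )
        elif f in m2:
            return all(f in ma and ma[f] == m2[f] for ma in ancestors)
        else:
            return True  # in neither parent: the merge did not delete it

    ma = {b'f': b'node0'}  # the merge ancestor's manifest
    # p1 kept f unchanged, p2 deleted it => the deletion came from p2
    assert deletion_from_parent(b'f', {b'f': b'node0'}, {}, [ma])
    # p1 modified f, p2 deleted it => the merge itself resolved the deletion
    assert not deletion_from_parent(b'f', {b'f': b'node1'}, {}, [ma])
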
90 | 90 | def computechangesetfilesremoved(ctx):
91 | 91 |     """return the list of files removed in a changeset
92 | 92 |     """
93 | 93 |     removed = []
94 | 94 |     for f in ctx.files():
95 | 95 |         if f not in ctx:
96 | 96 |             removed.append(f)
| 97 |     if removed:
| 98 |         rf = get_removal_filter(ctx)
| 99 |         removed = [r for r in removed if not rf(r)]
97 | 100 |     return removed
98 | 101 |
99 | 102 |
100 | 103 | def computechangesetcopies(ctx):
101 | 104 |     """return the copies data for a changeset
102 | 105 |
103 | 106 |     The copies data is returned as a pair of dictionaries (p1copies, p2copies).
104 | 107 |
105 | 108 |     Each dictionary is of the form: `{newname: oldname}`
106 | 109 |     """
107 | 110 |     p1copies = {}
108 | 111 |     p2copies = {}
109 | 112 |     p1 = ctx.p1()
110 | 113 |     p2 = ctx.p2()
111 | 114 |     narrowmatch = ctx._repo.narrowmatch()
112 | 115 |     for dst in ctx.files():
113 | 116 |         if not narrowmatch(dst) or dst not in ctx:
114 | 117 |             continue
115 | 118 |         copied = ctx[dst].renamed()
116 | 119 |         if not copied:
117 | 120 |             continue
118 | 121 |         src, srcnode = copied
119 | 122 |         if src in p1 and p1[src].filenode() == srcnode:
120 | 123 |             p1copies[dst] = src
121 | 124 |         elif src in p2 and p2[src].filenode() == srcnode:
122 | 125 |             p2copies[dst] = src
123 | 126 |     return p1copies, p2copies
124 | 127 |
125 | 128 |
125 | 128 | |
|
126 | 129 | def encodecopies(files, copies): |
|
127 | 130 | items = [] |
|
128 | 131 | for i, dst in enumerate(files): |
|
129 | 132 | if dst in copies: |
|
130 | 133 | items.append(b'%d\0%s' % (i, copies[dst])) |
|
131 | 134 | if len(items) != len(copies): |
|
132 | 135 | raise error.ProgrammingError( |
|
133 | 136 | b'some copy targets missing from file list' |
|
134 | 137 | ) |
|
135 | 138 | return b"\n".join(items) |
|
136 | 139 | |
|
137 | 140 | |
|
138 | 141 | def decodecopies(files, data): |
|
139 | 142 | try: |
|
140 | 143 | copies = {} |
|
141 | 144 | if not data: |
|
142 | 145 | return copies |
|
143 | 146 | for l in data.split(b'\n'): |
|
144 | 147 | strindex, src = l.split(b'\0') |
|
145 | 148 | i = int(strindex) |
|
146 | 149 | dst = files[i] |
|
147 | 150 | copies[dst] = src |
|
148 | 151 | return copies |
|
149 | 152 | except (ValueError, IndexError): |
|
150 | 153 | # Perhaps someone had chosen the same key name (e.g. "p1copies") and |
|
151 | 154 | # used different syntax for the value. |
|
152 | 155 | return None |
|
153 | 156 | |
|
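
The format pairs each copy destination's index in the sorted file list with its copy source, one NUL-separated record per line. A quick roundtrip of the same encoding, restated standalone (hypothetical file names):

    files = [b'dir/new.txt', b'other.txt']      # sorted(ctx.files())
    copies = {b'dir/new.txt': b'dir/old.txt'}   # {newname: oldname}

    # encodecopies produces one "<index>\0<source>" record per line
    encoded = b'\n'.join(
        b'%d\0%s' % (i, copies[dst])
        for i, dst in enumerate(files)
        if dst in copies
    )
    assert encoded == b'0\x00dir/old.txt'

    # decodecopies inverts it against the same sorted file list
    decoded = {}
    for l in encoded.split(b'\n'):
        strindex, src = l.split(b'\0')
        decoded[files[int(strindex)]] = src
    assert decoded == copies
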
154 | 157 | |
|
155 | 158 | def encodefileindices(files, subset): |
|
156 | 159 | subset = set(subset) |
|
157 | 160 | indices = [] |
|
158 | 161 | for i, f in enumerate(files): |
|
159 | 162 | if f in subset: |
|
160 | 163 | indices.append(b'%d' % i) |
|
161 | 164 | return b'\n'.join(indices) |
|
162 | 165 | |
|
163 | 166 | |
|
164 | 167 | def decodefileindices(files, data): |
|
165 | 168 | try: |
|
166 | 169 | subset = [] |
|
167 | 170 | if not data: |
|
168 | 171 | return subset |
|
169 | 172 | for strindex in data.split(b'\n'): |
|
170 | 173 | i = int(strindex) |
|
171 | 174 | if i < 0 or i >= len(files): |
|
172 | 175 | return None |
|
173 | 176 | subset.append(files[i]) |
|
174 | 177 | return subset |
|
175 | 178 | except (ValueError, IndexError): |
|
176 | 179 | # Perhaps someone had chosen the same key name (e.g. "added") and |
|
177 | 180 | # used different syntax for the value. |
|
178 | 181 | return None |
|
179 | 182 | |
|
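
The added/removed lists use an even simpler scheme: newline-separated indices into the sorted file list. Restated standalone with hypothetical data:

    files = [b'a', b'b', b'c']   # sorted(ctx.files())
    subset = [b'a', b'c']        # e.g. the computed filesadded list
    wanted = set(subset)

    encoded = b'\n'.join(
        b'%d' % i for i, f in enumerate(files) if f in wanted
    )
    assert encoded == b'0\n2'
    assert [files[int(s)] for s in encoded.split(b'\n')] == subset
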
180 | 183 | |
|
181 | 184 | def _getsidedata(srcrepo, rev): |
|
182 | 185 | ctx = srcrepo[rev] |
|
183 | 186 | filescopies = computechangesetcopies(ctx) |
|
184 | 187 | filesadded = computechangesetfilesadded(ctx) |
|
185 | 188 | filesremoved = computechangesetfilesremoved(ctx) |
|
186 | 189 | sidedata = {} |
|
187 | 190 | if any([filescopies, filesadded, filesremoved]): |
|
188 | 191 | sortedfiles = sorted(ctx.files()) |
|
189 | 192 | p1copies, p2copies = filescopies |
|
190 | 193 | p1copies = encodecopies(sortedfiles, p1copies) |
|
191 | 194 | p2copies = encodecopies(sortedfiles, p2copies) |
|
192 | 195 | filesadded = encodefileindices(sortedfiles, filesadded) |
|
193 | 196 | filesremoved = encodefileindices(sortedfiles, filesremoved) |
|
194 | 197 | if p1copies: |
|
195 | 198 | sidedata[sidedatamod.SD_P1COPIES] = p1copies |
|
196 | 199 | if p2copies: |
|
197 | 200 | sidedata[sidedatamod.SD_P2COPIES] = p2copies |
|
198 | 201 | if filesadded: |
|
199 | 202 | sidedata[sidedatamod.SD_FILESADDED] = filesadded |
|
200 | 203 | if filesremoved: |
|
201 | 204 | sidedata[sidedatamod.SD_FILESREMOVED] = filesremoved |
|
202 | 205 | return sidedata |
|
203 | 206 | |
|
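
Putting the pieces together, the returned map might look like this for a changeset whose sorted files are [b'a', b'b', b'c'], where b was copied from a in p1 and c was removed. The SD_* names are integer key constants from sidedatamod; the numeric values below are assumed for the sketch:

    # Assumed constant values, for illustration only; the real ones live
    # in mercurial.revlogutils.sidedata.
    SD_P1COPIES, SD_FILESREMOVED = 8, 11
    sidedata = {
        SD_P1COPIES: b'1\x00a',   # index 1 (b'b') was copied from b'a'
        SD_FILESREMOVED: b'2',    # index 2 (b'c') was removed
    }
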
204 | 207 | |
|
205 | 208 | def getsidedataadder(srcrepo, destrepo): |
|
206 | 209 | use_w = srcrepo.ui.configbool(b'experimental', b'worker.repository-upgrade') |
|
207 | 210 | if pycompat.iswindows or not use_w: |
|
208 | 211 | return _get_simple_sidedata_adder(srcrepo, destrepo) |
|
209 | 212 | else: |
|
210 | 213 | return _get_worker_sidedata_adder(srcrepo, destrepo) |
|
211 | 214 | |
|
212 | 215 | |
|
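
The worker pool is therefore only used when the experimental knob read above is enabled, and never on Windows. In hgrc terms that knob is:

    [experimental]
    worker.repository-upgrade = yes
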
213 | 216 | def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
214 | 217 |     """The function used by workers precomputing sidedata
215 | 218 |
216 | 219 |     It reads an input queue containing revision numbers.
217 | 220 |     It writes (rev, <sidedata-map>) pairs to an output queue.
218 | 221 |
219 | 222 |     The `None` input value is used as a stop signal.
220 | 223 |
221 | 224 |     The `tokens` semaphore is used to avoid having too many unprocessed
222 | 225 |     entries. Each worker needs to acquire one token before fetching a task.
223 | 226 |     The tokens are released by the consumer of the produced data.
224 | 227 |     """
225 | 228 |     tokens.acquire()
226 | 229 |     rev = revs_queue.get()
227 | 230 |     while rev is not None:
228 | 231 |         data = _getsidedata(srcrepo, rev)
229 | 232 |         sidedata_queue.put((rev, data))
230 | 233 |         tokens.acquire()
231 | 234 |         rev = revs_queue.get()
232 | 235 |     # processing of `None` is completed, release the token.
233 | 236 |     tokens.release()
234 | 237 |
235 | 238 |
236 | 239 | BUFF_PER_WORKER = 50
237 | 240 |
238 | 241 |
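
The acquire/release pairing implements back-pressure: workers stall once too many results sit unconsumed. A minimal standalone sketch of the same pattern, using threads instead of processes for brevity (not this module's API):

    import queue
    import threading

    tokens = threading.BoundedSemaphore(2)   # at most 2 unconsumed results
    tasks, results = queue.Queue(), queue.Queue()

    def worker():
        tokens.acquire()
        t = tasks.get()
        while t is not None:
            results.put((t, t * t))          # stand-in for _getsidedata
            tokens.acquire()
            t = tasks.get()
        tokens.release()                     # token taken for the stop signal

    for t in [1, 2, 3, None]:                # tasks plus the stop marker
        tasks.put(t)
    threading.Thread(target=worker).start()
    for _ in range(3):
        rev, data = results.get()
        tokens.release()                     # consumer frees a buffer slot
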
238 | 241 | |
|
239 | 242 | def _get_worker_sidedata_adder(srcrepo, destrepo): |
|
240 | 243 | """The parallel version of the sidedata computation |
|
241 | 244 | |
|
242 | 245 | This code spawn a pool of worker that precompute a buffer of sidedata |
|
243 | 246 | before we actually need them""" |
|
244 | 247 | # avoid circular import copies -> scmutil -> worker -> copies |
|
245 | 248 | from . import worker |
|
246 | 249 | |
|
247 | 250 | nbworkers = worker._numworkers(srcrepo.ui) |
|
248 | 251 | |
|
249 | 252 | tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER) |
|
250 | 253 | revsq = multiprocessing.Queue() |
|
251 | 254 | sidedataq = multiprocessing.Queue() |
|
252 | 255 | |
|
253 | 256 | assert srcrepo.filtername is None |
|
254 | 257 | # queue all tasks beforehand, revision numbers are small and it make |
|
255 | 258 | # synchronisation simpler |
|
256 | 259 | # |
|
257 | 260 | # Since the computation for each node can be quite expensive, the overhead |
|
258 | 261 | # of using a single queue is not revelant. In practice, most computation |
|
259 | 262 | # are fast but some are very expensive and dominate all the other smaller |
|
260 | 263 | # cost. |
|
261 | 264 | for r in srcrepo.changelog.revs(): |
|
262 | 265 | revsq.put(r) |
|
263 | 266 | # queue the "no more tasks" markers |
|
264 | 267 | for i in range(nbworkers): |
|
265 | 268 | revsq.put(None) |
|
266 | 269 | |
|
267 | 270 | allworkers = [] |
|
268 | 271 | for i in range(nbworkers): |
|
269 | 272 | args = (srcrepo, revsq, sidedataq, tokens) |
|
270 | 273 | w = multiprocessing.Process(target=_sidedata_worker, args=args) |
|
271 | 274 | allworkers.append(w) |
|
272 | 275 | w.start() |
|
273 | 276 | |
|
274 | 277 | # dictionnary to store results for revision higher than we one we are |
|
275 | 278 | # looking for. For example, if we need the sidedatamap for 42, and 43 is |
|
276 | 279 | # received, when shelve 43 for later use. |
|
277 | 280 | staging = {} |
|
278 | 281 | |
|
279 | 282 | def sidedata_companion(revlog, rev): |
|
280 | 283 | sidedata = {} |
|
281 | 284 | if util.safehasattr(revlog, b'filteredrevs'): # this is a changelog |
|
282 | 285 | # Is the data previously shelved ? |
|
283 | 286 | sidedata = staging.pop(rev, None) |
|
284 | 287 | if sidedata is None: |
|
285 | 288 | # look at the queued result until we find the one we are lookig |
|
286 | 289 | # for (shelve the other ones) |
|
287 | 290 | r, sidedata = sidedataq.get() |
|
288 | 291 | while r != rev: |
|
289 | 292 | staging[r] = sidedata |
|
290 | 293 | r, sidedata = sidedataq.get() |
|
291 | 294 | tokens.release() |
|
292 | 295 | return False, (), sidedata |
|
293 | 296 | |
|
294 | 297 | return sidedata_companion |
|
295 | 298 | |
|
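
Results can arrive in any order, which is what the staging dict absorbs. A standalone sketch of that consumption logic (queue contents are hypothetical):

    import queue

    sidedataq = queue.Queue()
    for item in [(43, b'd43'), (41, b'd41'), (42, b'd42')]:
        sidedataq.put(item)          # results arrive out of order

    staging = {}

    def fetch(rev):
        # same shelving logic as sidedata_companion above
        data = staging.pop(rev, None)
        if data is None:
            r, data = sidedataq.get()
            while r != rev:
                staging[r] = data    # shelve results for other revisions
                r, data = sidedataq.get()
        return data

    assert [fetch(r) for r in (41, 42, 43)] == [b'd41', b'd42', b'd43']
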
296 | 299 | |
|
297 | 300 | def _get_simple_sidedata_adder(srcrepo, destrepo): |
|
298 | 301 | """The simple version of the sidedata computation |
|
299 | 302 | |
|
300 | 303 | It just compute it in the same thread on request""" |
|
301 | 304 | |
|
302 | 305 | def sidedatacompanion(revlog, rev): |
|
303 | 306 | sidedata = {} |
|
304 | 307 | if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog |
|
305 | 308 | sidedata = _getsidedata(srcrepo, rev) |
|
306 | 309 | return False, (), sidedata |
|
307 | 310 | |
|
308 | 311 | return sidedatacompanion |
|
309 | 312 | |
|
310 | 313 | |
|
311 | 314 | def getsidedataremover(srcrepo, destrepo): |
|
312 | 315 | def sidedatacompanion(revlog, rev): |
|
313 | 316 | f = () |
|
314 | 317 | if util.safehasattr(revlog, 'filteredrevs'): # this is a changelog |
|
315 | 318 | if revlog.flags(rev) & sidedataflag.REVIDX_SIDEDATA: |
|
316 | 319 | f = ( |
|
317 | 320 | sidedatamod.SD_P1COPIES, |
|
318 | 321 | sidedatamod.SD_P2COPIES, |
|
319 | 322 | sidedatamod.SD_FILESADDED, |
|
320 | 323 | sidedatamod.SD_FILESREMOVED, |
|
321 | 324 | ) |
|
322 | 325 | return False, f, {} |
|
323 | 326 | |
|
324 | 327 | return sidedatacompanion |