##// END OF EJS Templates
synthrepo: add missing import of sys...
Bryan O'Sullivan -
r18927:deffb5e9 default
parent child Browse files
Show More
@@ -1,379 +1,379 b''
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26
27 27 A few obvious properties that are not currently handled realistically:
28 28
29 29 - Merges are treated as regular commits with two parents, which is not
30 30 realistic
31 31 - Modifications are not treated as operations on hunks of lines, but
32 32 as insertions and deletions of randomly chosen single lines
33 33 - Committer ID (always random)
34 34 - Executability of files
35 35 - Symlinks and binary files are ignored
36 36 '''
37 37
38 import bisect, collections, json, os, random, time
38 import bisect, collections, json, os, random, time, sys
39 39 from mercurial import cmdutil, context, patch, scmutil, url, util, hg
40 40 from mercurial.i18n import _
41 41 from mercurial.node import nullrev, nullid
42 42
testedwith = 'internal'

# Command table; populated by the @command decorator applied to
# analyze() and synthesize() below.
cmdtable = {}
command = cmdutil.command(cmdtable)

# First six characters of git-diff header lines that introduce a new
# file (matched against line[:6] in parsegitdiff).
newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
49 49
def zerodict():
    """Return a counter dict whose missing keys default to 0.

    Uses the int constructor directly as the default factory; it is
    the idiomatic (and slightly cheaper) equivalent of 'lambda: 0'.
    """
    return collections.defaultdict(int)
52 52
def roundto(x, k):
    """Round x to the nearest multiple of k, but only once x is
    comfortably larger than k (x > 2*k); small values are merely
    rounded to the nearest integer so they keep their resolution.
    """
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
57 57
def parsegitdiff(lines):
    '''Parse a git-style diff into per-file change statistics.

    Yields one (filename, mar, lineadd, lineremove, binary) tuple per
    file touched, where mar is 'a' (added), 'm' (modified) or 'r'
    (removed), lineadd maps rounded added-line lengths to counts,
    lineremove is the number of removed lines, and binary flags GIT
    binary patches.
    '''
    filename = None
    mar = None
    lineadd = zerodict()
    lineremove = 0
    binary = False
    for line in lines:
        tag = line[:6]
        if tag == 'diff -':
            # New file header: flush stats for the previous file, then
            # reset the accumulators before extracting the new name.
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar = 'm'
            lineadd = zerodict()
            lineremove = 0
            binary = False
            filename = patch.gitre.match(line).group(1)
        elif tag in newfile:
            mar = 'a'
        elif tag == 'GIT bi':
            binary = True
        elif tag == 'delete':
            mar = 'r'
        elif tag:
            c = tag[0]
            if c == '+' and not line.startswith('+++ '):
                # Bucket added-line lengths into multiples of five.
                lineadd[roundto(len(line) - 1, 5)] += 1
            elif c == '-' and not line.startswith('--- '):
                lineremove += 1
    # Flush the final file, if any diff was seen at all.
    if filename:
        yield filename, mar, lineadd, lineremove, binary
82 82
@command('analyze',
         [('o', 'output', [], _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'))
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(repo.root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Everything below may raise; make sure a real output file is
    # closed on the way out.  Never close sys.stdout - the original
    # unconditional fp.close() would close it when output was '-'.
    try:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        # Frequency tables, all mapping an observation to a count.
        lineschanged = zerodict()
        children = zerodict()
        p1distance = zerodict()
        p2distance = zerodict()
        linesinfilesadded = zerodict()
        fileschanged = zerodict()
        filesadded = zerodict()
        filesremoved = zerodict()
        linelengths = zerodict()
        interarrival = zerodict()
        parents = zerodict()
        dirsadded = zerodict()
        tzoffset = zerodict()

        progress = ui.progress
        _analyzing = _('analyzing')
        _changesets = _('changesets')
        _total = len(revs)

        for i, rev in enumerate(revs):
            progress(_analyzing, i, unit=_changesets, total=_total)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            # Interarrival time is measured against the previous rev in
            # numeric order, not necessarily the first parent.
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                interarrival[roundto(ctx.date()[0] - lastctx.date()[0],
                                     300)] += 1
            diff = sum((d.splitlines()
                        for d in ctx.diff(pctx, opts=dict(git=True))), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, binary in \
                    parsegitdiff(diff):
                if binary:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1

        # Invert children: map "number of children" -> "how many revs
        # had that many children".
        invchildren = zerodict()

        for rev, count in children.iteritems():
            invchildren[count] += 1

        if output != '-':
            ui.status(_('writing output to %s\n') % output)

        def pronk(d):
            # Serialize a frequency table as (value, count) pairs,
            # most frequent first.
            return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

        json.dump(dict(revs=len(revs),
                       lineschanged=pronk(lineschanged),
                       children=pronk(invchildren),
                       fileschanged=pronk(fileschanged),
                       filesadded=pronk(filesadded),
                       linesinfilesadded=pronk(linesinfilesadded),
                       dirsadded=pronk(dirsadded),
                       filesremoved=pronk(filesremoved),
                       linelengths=pronk(linelengths),
                       parents=pronk(parents),
                       p1distance=pronk(p1distance),
                       p2distance=pronk(p2distance),
                       interarrival=pronk(interarrival),
                       tzoffset=pronk(tzoffset),
                       ),
                  fp)
    finally:
        if fp is not sys.stdout:
            fp.close()
209 209
210 210 @command('synthesize',
211 211 [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
212 212 ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
213 213 _('hg synthesize [OPTION].. DESCFILE'))
214 214 def synthesize(ui, repo, descpath, **opts):
215 215 '''synthesize commits based on a model of an existing repository
216 216
217 217 The model must have been generated by :hg:`analyze`. Commits will
218 218 be generated randomly according to the probabilities described in
219 219 the model.
220 220
221 221 When synthesizing new content, commit descriptions, and user
222 222 names, words will be chosen randomly from a dictionary that is
223 223 presumed to contain one word per line. Use --dict to specify the
224 224 path to an alternate dictionary to use.
225 225 '''
226 226 try:
227 227 fp = hg.openpath(ui, descpath)
228 228 except Exception, err:
229 229 raise util.Abort('%s: %s' % (descpath, err[0].strerror))
230 230 desc = json.load(fp)
231 231 fp.close()
232 232
233 233 def cdf(l):
234 234 if not l:
235 235 return [], []
236 236 vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
237 237 t = float(sum(probs, 0))
238 238 s, cdfs = 0, []
239 239 for v in probs:
240 240 s += v
241 241 cdfs.append(s / t)
242 242 return vals, cdfs
243 243
244 244 lineschanged = cdf(desc['lineschanged'])
245 245 fileschanged = cdf(desc['fileschanged'])
246 246 filesadded = cdf(desc['filesadded'])
247 247 dirsadded = cdf(desc['dirsadded'])
248 248 filesremoved = cdf(desc['filesremoved'])
249 249 linelengths = cdf(desc['linelengths'])
250 250 parents = cdf(desc['parents'])
251 251 p1distance = cdf(desc['p1distance'])
252 252 p2distance = cdf(desc['p2distance'])
253 253 interarrival = cdf(desc['interarrival'])
254 254 linesinfilesadded = cdf(desc['linesinfilesadded'])
255 255 tzoffset = cdf(desc['tzoffset'])
256 256
257 257 dictfile = opts.get('dict') or '/usr/share/dict/words'
258 258 try:
259 259 fp = open(dictfile, 'rU')
260 260 except IOError, err:
261 261 raise util.Abort('%s: %s' % (dictfile, err.strerror))
262 262 words = fp.read().splitlines()
263 263 fp.close()
264 264
265 265 def pick(cdf):
266 266 return cdf[0][bisect.bisect_left(cdf[1], random.random())]
267 267
268 268 def makeline(minimum=0):
269 269 total = max(minimum, pick(linelengths))
270 270 c, l = 0, []
271 271 while c < total:
272 272 w = random.choice(words)
273 273 c += len(w) + 1
274 274 l.append(w)
275 275 return ' '.join(l)
276 276
277 277 wlock = repo.wlock()
278 278 lock = repo.lock()
279 279
280 280 nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))
281 281
282 282 progress = ui.progress
283 283 _synthesizing = _('synthesizing')
284 284 _changesets = _('changesets')
285 285
286 286 count = int(opts['count'])
287 287 heads = set(map(repo.changelog.rev, repo.heads()))
288 288 for i in xrange(count):
289 289 progress(_synthesizing, i, unit=_changesets, total=count)
290 290
291 291 node = repo.changelog.node
292 292 revs = len(repo)
293 293
294 294 def pickhead(heads, distance):
295 295 if heads:
296 296 lheads = sorted(heads)
297 297 rev = revs - min(pick(distance), revs)
298 298 if rev < lheads[-1]:
299 299 rev = lheads[bisect.bisect_left(lheads, rev)]
300 300 else:
301 301 rev = lheads[-1]
302 302 return rev, node(rev)
303 303 return nullrev, nullid
304 304
305 305 r1 = revs - min(pick(p1distance), revs)
306 306 p1 = node(r1)
307 307
308 308 # the number of heads will grow without bound if we use a pure
309 309 # model, so artificially constrain their proliferation
310 310 if pick(parents) == 2 or len(heads) > random.randint(1, 20):
311 311 r2, p2 = pickhead(heads.difference([r1]), p2distance)
312 312 else:
313 313 r2, p2 = nullrev, nullid
314 314
315 315 pl = [p1, p2]
316 316 pctx = repo[r1]
317 317 mf = pctx.manifest()
318 318 mfk = mf.keys()
319 319 changes = {}
320 320 if mfk:
321 321 for __ in xrange(pick(fileschanged)):
322 322 for __ in xrange(10):
323 323 fctx = pctx.filectx(random.choice(mfk))
324 324 path = fctx.path()
325 325 if not (path in nevertouch or fctx.isbinary() or
326 326 'l' in fctx.flags()):
327 327 break
328 328 lines = fctx.data().splitlines()
329 329 add, remove = pick(lineschanged)
330 330 for __ in xrange(remove):
331 331 if not lines:
332 332 break
333 333 del lines[random.randrange(0, len(lines))]
334 334 for __ in xrange(add):
335 335 lines.insert(random.randint(0, len(lines)), makeline())
336 336 path = fctx.path()
337 337 changes[path] = context.memfilectx(path,
338 338 '\n'.join(lines) + '\n')
339 339 for __ in xrange(pick(filesremoved)):
340 340 path = random.choice(mfk)
341 341 for __ in xrange(10):
342 342 path = random.choice(mfk)
343 343 if path not in changes:
344 344 changes[path] = None
345 345 break
346 346 if filesadded:
347 347 dirs = list(pctx.dirs())
348 348 dirs.append('')
349 349 for __ in xrange(pick(filesadded)):
350 350 path = [random.choice(dirs)]
351 351 if pick(dirsadded):
352 352 path.append(random.choice(words))
353 353 path.append(random.choice(words))
354 354 path = '/'.join(filter(None, path))
355 355 data = '\n'.join(makeline()
356 356 for __ in xrange(pick(linesinfilesadded))) + '\n'
357 357 changes[path] = context.memfilectx(path, data)
358 358 def filectxfn(repo, memctx, path):
359 359 data = changes[path]
360 360 if data is None:
361 361 raise IOError
362 362 return data
363 363 if not changes:
364 364 continue
365 365 if revs:
366 366 date = repo['tip'].date()[0] + pick(interarrival)
367 367 else:
368 368 date = time.time() - (86400 * count)
369 369 user = random.choice(words) + '@' + random.choice(words)
370 370 mc = context.memctx(repo, pl, makeline(minimum=2),
371 371 sorted(changes.iterkeys()),
372 372 filectxfn, user, '%d %d' % (date, pick(tzoffset)))
373 373 newnode = mc.commit()
374 374 heads.add(repo.changelog.rev(newnode))
375 375 heads.discard(r1)
376 376 heads.discard(r2)
377 377
378 378 lock.release()
379 379 wlock.release()
General Comments 0
You need to be logged in to leave comments. Login now