contrib: add a commit synthesizer for reproducing scaling problems...
Bryan O'Sullivan
r17734:619068c2 default
@@ -0,0 +1,377 b''
# synthrepo.py - repo synthesis
#
# Copyright 2012 Facebook
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''synthesize structurally interesting change history

This extension is useful for creating a repository with properties
that are statistically similar to an existing repository. During
analysis, a simple probability table is constructed from the history
of an existing repository. During synthesis, these properties are
reconstructed.

Properties that are analyzed and synthesized include the following:

- Lines added or removed when an existing file is modified
- Number and sizes of files added
- Number of files removed
- Line lengths
- Topological distance to parent changeset(s)
- Probability of a commit being a merge
- Probability of a newly added file being added to a new directory
- Interarrival time, and time zone, of commits

A few obvious properties that are not currently handled realistically:

- Merges are treated as regular commits with two parents, which is not
  realistic
- Modifications are not treated as operations on hunks of lines, but
  as insertions and deletions of randomly chosen single lines
- Committer ID (always random)
- Executability of files
- Symlinks and binary files are ignored
'''

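# Example usage, assuming this extension is enabled; the paths and
# counts below are illustrative:
#
#   hg -R existing-repo analyze -o model.json
#   hg init synthetic-repo
#   hg -R synthetic-repo synthesize --count 500 model.json
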
import bisect, collections, json, os, random, sys, time
from mercurial import cmdutil, context, patch, scmutil, url, util
from mercurial.i18n import _
from mercurial.node import nullrev, nullid

testedwith = 'internal'

cmdtable = {}
command = cmdutil.command(cmdtable)

newfile = set(('new fi', 'rename', 'copy f', 'copy t'))

def zerodict():
    return collections.defaultdict(lambda: 0)

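# Bucket a value to the nearest multiple of k once it exceeds 2*k, so
# that the frequency tables stay compact; e.g. roundto(123, 5) == 125,
# while roundto(7, 5) == 7 retains full resolution for small values.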
def roundto(x, k):
    if x > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))

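# Parse a git-format diff by dispatching on the first six characters of
# each line, yielding one (filename, add/modify/remove flag, histogram
# of added-line lengths, removed-line count, is-binary) tuple per file.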
def parsegitdiff(lines):
    filename, mar, lineadd, lineremove = None, None, zerodict(), 0
    binary = False
    for line in lines:
        start = line[:6]
        if start == 'diff -':
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
            filename = patch.gitre.match(line).group(1)
        elif start in newfile:
            mar = 'a'
        elif start == 'GIT bi':
            binary = True
        elif start == 'delete':
            mar = 'r'
        elif start:
            s = start[0]
            if s == '-' and not line.startswith('--- '):
                lineremove += 1
            elif s == '+' and not line.startswith('+++ '):
                lineadd[roundto(len(line) - 1, 5)] += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'))
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(repo.root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    revs = scmutil.revrange(repo, revs)
    revs.sort()

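    # Each table maps an observed value to the number of times it was
    # seen; together these tables are the model that is dumped as JSON
    # below.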
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    progress = ui.progress
    _analyzing = _('analyzing')
    _changesets = _('changesets')
    _total = len(revs)

    for i, rev in enumerate(revs):
        progress(_analyzing, i, unit=_changesets, total=_total)
        ctx = repo[rev]
        pl = ctx.parents()
        pctx = pl[0]
        prev = pctx.rev()
        children[prev] += 1
        p1distance[rev - prev] += 1
        parents[len(pl)] += 1
        tzoffset[ctx.date()[1]] += 1
        if len(pl) > 1:
            p2distance[rev - pl[1].rev()] += 1
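        # measure interarrival time against the changeset that precedes
        # this one in revision order, reusing pctx when it happens to be
        # that changeset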
        if prev == rev - 1:
            lastctx = pctx
        else:
            lastctx = repo[rev - 1]
        if lastctx.rev() != nullrev:
            interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
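        # flatten the per-file diffs into a single list of lines; git=True
        # is needed so that renames, copies, and binary files show up in a
        # form parsegitdiff can recognize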
        diff = sum((d.splitlines()
                    for d in ctx.diff(pctx, opts=dict(git=True))), [])
        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
        for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
            if binary:
                continue
            added = sum(lineadd.itervalues(), 0)
            if mar == 'm':
                if added and lineremove:
                    lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
                    filechanges += 1
            elif mar == 'a':
                fileadds += 1
                if '/' in filename:
                    filedir = filename.rsplit('/', 1)[0]
                    if filedir not in pctx.dirs():
                        diradds += 1
                linesinfilesadded[roundto(added, 5)] += 1
            elif mar == 'r':
                fileremoves += 1
            for length, count in lineadd.iteritems():
                linelengths[length] += count
        fileschanged[filechanges] += 1
        filesadded[fileadds] += 1
        dirsadded[diradds] += 1
        filesremoved[fileremoves] += 1

    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

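    # serialize a frequency table as a list of (value, count) pairs,
    # most common value first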
    def pronk(d):
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump(dict(revs=len(revs),
                   lineschanged=pronk(lineschanged),
                   children=pronk(invchildren),
                   fileschanged=pronk(fileschanged),
                   filesadded=pronk(filesadded),
                   linesinfilesadded=pronk(linesinfilesadded),
                   dirsadded=pronk(dirsadded),
                   filesremoved=pronk(filesremoved),
                   linelengths=pronk(linelengths),
                   parents=pronk(parents),
                   p1distance=pronk(p1distance),
                   p2distance=pronk(p2distance),
                   interarrival=pronk(interarrival),
                   tzoffset=pronk(tzoffset),
                   ),
              fp)
    fp.close()

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
         _('hg synthesize [OPTION]... DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model.

    When synthesizing new content, commit descriptions, and user
    names, words are chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary.
    '''
    try:
        fp = url.open(ui, descpath)
    except Exception, err:
        raise util.Abort('%s: %s' % (descpath, err))
    desc = json.load(fp)
    fp.close()

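    # Convert a list of (value, count) pairs into a pair of parallel
    # sequences: values and their cumulative probabilities; e.g.
    # [(10, 3), (20, 1)] becomes ((10, 20), [0.75, 1.0]).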
    def cdf(l):
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError, err:
        raise util.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

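    # draw a value from a cdf table by inverse transform sampling:
    # locate a uniform random number within the cumulative distribution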
    def pick(cdf):
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

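    # assemble a line of randomly chosen dictionary words, long enough
    # to match a length drawn from the line-length distribution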
    def makeline(minimum=0):
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))

    progress = ui.progress
    _synthesizing = _('synthesizing')
    _changesets = _('changesets')

    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    for i in xrange(count):
        progress(_synthesizing, i, unit=_changesets, total=count)

        node = repo.changelog.node
        revs = len(repo)

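        # choose an existing head at roughly the modeled topological
        # distance from the tip, or the null revision if there are no
        # candidate heads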
        def pickhead(heads, distance):
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid

        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        if pick(parents) == 2 or len(heads) > random.randint(1, 20):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        changes = {}
        if mfk:
            for __ in xrange(pick(fileschanged)):
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                changes[path] = context.memfilectx(path,
                                                   '\n'.join(lines) + '\n')
            for __ in xrange(pick(filesremoved)):
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        changes[path] = None
                        break
        if filesadded:
            dirs = list(pctx.dirs())
            dirs.append('')
            for __ in xrange(pick(filesadded)):
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                path = '/'.join(filter(None, path))
                data = '\n'.join(makeline()
                                 for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[path] = context.memfilectx(path, data)
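        # memctx calls this back for every path in changes; a value of
        # None marks a removal, which is signalled by raising IOError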
        def filectxfn(repo, memctx, path):
            data = changes[path]
            if data is None:
                raise IOError
            return data
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            date = time.time() - (86400 * count)
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes.iterkeys()),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)

    lock.release()
    wlock.release()