##// END OF EJS Templates
contrib/synthrepo: only generate 2 parents if model contains merges...
Mike Edgar -
r22472:2e2577b0 default
parent child Browse files
Show More
@@ -1,376 +1,377
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26
27 27 A few obvious properties that are not currently handled realistically:
28 28
29 29 - Merges are treated as regular commits with two parents, which is not
30 30 realistic
31 31 - Modifications are not treated as operations on hunks of lines, but
32 32 as insertions and deletions of randomly chosen single lines
33 33 - Committer ID (always random)
34 34 - Executability of files
35 35 - Symlinks and binary files are ignored
36 36 '''
37 37
38 38 import bisect, collections, json, os, random, time, sys
39 39 from mercurial import cmdutil, context, patch, scmutil, util, hg
40 40 from mercurial.i18n import _
41 41 from mercurial.node import nullrev, nullid
42 42
testedwith = 'internal'

# command table populated by the @command decorator below
cmdtable = {}
command = cmdutil.command(cmdtable)

# first six characters of git-diff header lines that introduce a file
# (compared against line[:6] in parsegitdiff)
newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
49 49
def zerodict():
    '''Return a dict whose missing keys default to 0 (a simple counter).'''
    # int() == 0, so the builtin is the idiomatic factory here; it is
    # equivalent to the hand-rolled ``lambda: 0`` but clearer and picklable.
    return collections.defaultdict(int)
52 52
def roundto(x, k):
    '''Round x to the nearest multiple of k, keeping small values exact.

    Values no larger than 2*k are only rounded to the nearest integer,
    so small observations retain full resolution while larger ones are
    bucketed into multiples of k.
    '''
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
57 57
def parsegitdiff(lines):
    '''Parse a git-style diff into per-file change statistics.

    Yields one (filename, mar, lineadd, lineremove, binary) tuple per
    file touched by the diff, where mar is 'm'odified, 'a'dded or
    'r'emoved, lineadd maps bucketed added-line lengths to counts,
    lineremove counts deleted lines, and binary flags binary files.
    '''
    fname = None
    mar = None
    added = zerodict()
    removed = 0
    isbinary = False
    for line in lines:
        prefix = line[:6]
        if prefix == 'diff -':
            # a new file begins: emit the stats of the previous one
            if fname:
                yield fname, mar, added, removed, isbinary
            mar, added, removed, isbinary = 'm', zerodict(), 0, False
            fname = patch.gitre.match(line).group(1)
        elif prefix in newfile:
            mar = 'a'
        elif prefix == 'GIT bi':
            isbinary = True
        elif prefix == 'delete':
            mar = 'r'
        elif prefix:
            c = prefix[0]
            if c == '+' and not line.startswith('+++ '):
                # bucket added-line lengths (minus newline) to multiples of 5
                added[roundto(len(line) - 1, 5)] += 1
            elif c == '-' and not line.startswith('--- '):
                removed += 1
    if fname:
        yield fname, mar, added, removed, isbinary
82 82
@command('analyze',
         [('o', 'output', [], _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'))
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        # no revisions given: analyze the whole history
        revs = [':']

    output = opts['output']
    if not output:
        # default output file is named after the repository root
        output = os.path.basename(repo.root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    revs = scmutil.revrange(repo, revs)
    revs.sort()

    # frequency tables, one per modeled property; each maps an observed
    # value to the number of times it was seen
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # hoist loop-invariant lookups out of the hot loop
    progress = ui.progress
    _analyzing = _('analyzing')
    _changesets = _('changesets')
    _total = len(revs)

    for i, rev in enumerate(revs):
        progress(_analyzing, i, unit=_changesets, total=_total)
        ctx = repo[rev]
        pl = ctx.parents()
        pctx = pl[0]
        prev = pctx.rev()
        children[prev] += 1
        p1distance[rev - prev] += 1
        parents[len(pl)] += 1
        tzoffset[ctx.date()[1]] += 1
        if len(pl) > 1:
            # merge commit: also record topological distance to p2
            p2distance[rev - pl[1].rev()] += 1
        if prev == rev - 1:
            lastctx = pctx
        else:
            lastctx = repo[rev - 1]
        if lastctx.rev() != nullrev:
            # commit interarrival time, bucketed to 5-minute granularity
            interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
        diff = sum((d.splitlines()
                    for d in ctx.diff(pctx, opts={'git': True})), [])
        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
        for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
            if binary:
                # binary files are not modeled (see module docstring)
                continue
            added = sum(lineadd.itervalues(), 0)
            if mar == 'm':
                if added and lineremove:
                    lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
                filechanges += 1
            elif mar == 'a':
                fileadds += 1
                if '/' in filename:
                    filedir = filename.rsplit('/', 1)[0]
                    if filedir not in pctx.dirs():
                        # file added into a directory new to the repo
                        diradds += 1
                linesinfilesadded[roundto(added, 5)] += 1
            elif mar == 'r':
                fileremoves += 1
            for length, count in lineadd.iteritems():
                linelengths[length] += count
        fileschanged[filechanges] += 1
        filesadded[fileadds] += 1
        dirsadded[diradds] += 1
        filesremoved[fileremoves] += 1

    # invert children: map "number of children" -> "number of revisions"
    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # (value, count) pairs sorted by descending frequency
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    fp.close()
209 209
@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception, err:
        raise util.Abort('%s: %s' % (descpath, err[0].strerror))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # turn a list of (value, count) pairs into parallel lists of
        # (values, cumulative probabilities) usable by pick()
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    # one distribution per property recorded by `hg analyze`
    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError, err:
        raise util.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    def pick(cdf):
        # draw a random value from a (values, cumulative probs) pair
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def makeline(minimum=0):
        # build a line of at least `minimum` characters from random words
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))

    progress = ui.progress
    _synthesizing = _('synthesizing')
    _changesets = _('changesets')

    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    for i in xrange(count):
        progress(_synthesizing, i, unit=_changesets, total=count)

        node = repo.changelog.node
        revs = len(repo)

        def pickhead(heads, distance):
            # choose an existing head roughly `distance` revisions back;
            # fall back to the null revision when there are no heads
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid

        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        toomanyheads = len(heads) > random.randint(1, 20)
        # only generate a second parent if the analyzed model actually
        # contained merges (p2distance[0] is its non-empty value list)
        if p2distance[0] and (pick(parents) == 2 or toomanyheads):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        changes = {}
        if mfk:
            # modify some existing files
            for __ in xrange(pick(fileschanged)):
                # retry up to 10 times to find a touchable text file
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                path = fctx.path()
                changes[path] = context.memfilectx(repo, path,
                                                   '\n'.join(lines) + '\n')
            # remove some existing files
            for __ in xrange(pick(filesremoved)):
                path = random.choice(mfk)
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        # None marks the file as removed in the memctx
                        changes[path] = None
                        break
        if filesadded:
            # add some new files, possibly in new directories
            dirs = list(pctx.dirs())
            dirs.append('')
            for __ in xrange(pick(filesadded)):
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                path = '/'.join(filter(None, path))
                data = '\n'.join(makeline()
                                 for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[path] = context.memfilectx(repo, path, data)
        def filectxfn(repo, memctx, path):
            return changes[path]
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            # empty repo: start the synthetic history `count` days ago
            date = time.time() - (86400 * count)
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes.iterkeys()),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        # the new commit becomes a head; its parents no longer are
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)

    lock.release()
    wlock.release()
General Comments 0
You need to be logged in to leave comments. Login now