synthrepo: new filenames must not also be new directories, and vice-versa...
Mike Edgar
r23778:a5dbec25 default
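The crux of the change is in synthesize()'s initial-file loop below: previously a candidate path was re-rolled only if it named an existing directory, so a later pick could still create a file whose name collided with a directory created earlier in the same loop, or create a file under a path prefix that was already a file. The new validpath() helper rejects both directions, and each accepted file's parent directories are recorded so later picks see them. Restated outside the extension as a minimal, self-contained sketch (the sample paths are invented):

import os

def validpath(path, files, dirs):
    # A new file must not share its name with an existing directory...
    if path in dirs:
        return False
    # ...and no prefix of it (nor the path itself) may already be a file.
    while path:
        if path in files:
            return False
        path = os.path.dirname(path)
    return True

files, dirs = set(['lib/util']), set(['lib'])
assert not validpath('lib', files, dirs)         # 'lib' is already a directory
assert not validpath('lib/util/x', files, dirs)  # prefix 'lib/util' is a file
assert validpath('lib/helpers', files, dirs)     # no collision either way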
@@ -1,468 +1,485 @@
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
39 39 import bisect, collections, itertools, json, os, random, time, sys
40 40 from mercurial import cmdutil, context, patch, scmutil, util, hg
41 41 from mercurial.i18n import _
42 42 from mercurial.node import nullrev, nullid, short
43 43
44 44 testedwith = 'internal'
45 45
46 46 cmdtable = {}
47 47 command = cmdutil.command(cmdtable)
48 48
49 49 newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
50 50
51 51 def zerodict():
52 52 return collections.defaultdict(lambda: 0)
53 53
54 54 def roundto(x, k):
55 55 if x > k * 2:
56 56 return int(round(x / float(k)) * k)
57 57 return int(round(x))
58 58
59 59 def parsegitdiff(lines):
60 60 filename, mar, lineadd, lineremove = None, None, zerodict(), 0
61 61 binary = False
62 62 for line in lines:
63 63 start = line[:6]
64 64 if start == 'diff -':
65 65 if filename:
66 66 yield filename, mar, lineadd, lineremove, binary
67 67 mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
68 68 filename = patch.gitre.match(line).group(1)
69 69 elif start in newfile:
70 70 mar = 'a'
71 71 elif start == 'GIT bi':
72 72 binary = True
73 73 elif start == 'delete':
74 74 mar = 'r'
75 75 elif start:
76 76 s = start[0]
77 77 if s == '-' and not line.startswith('--- '):
78 78 lineremove += 1
79 79 elif s == '+' and not line.startswith('+++ '):
80 80 lineadd[roundto(len(line) - 1, 5)] += 1
81 81 if filename:
82 82 yield filename, mar, lineadd, lineremove, binary
83 83
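For orientation: parsegitdiff() walks a git-style diff line by line and yields one (filename, mar, lineadd, lineremove, binary) tuple per file, where mar is 'm', 'a' or 'r' for modified/added/removed, and lineadd maps added-line lengths (rounded to multiples of 5 by roundto()) to counts. A toy transcript, assuming this module's definitions are importable; the diff text is invented:

lines = '''diff --git a/foo.py b/foo.py
--- a/foo.py
+++ b/foo.py
@@ -1,1 +1,1 @@
-old line
+a slightly longer new line'''.splitlines()

for filename, mar, lineadd, lineremove, binary in parsegitdiff(lines):
    print('%s %s %r %d %r' % (filename, mar, dict(lineadd), lineremove, binary))
# foo.py m {25: 1} 1 False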
84 84 @command('analyze',
85 85 [('o', 'output', '', _('write output to given file'), _('FILE')),
86 86 ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
87 87 _('hg analyze'), optionalrepo=True)
88 88 def analyze(ui, repo, *revs, **opts):
89 89 '''create a simple model of a repository to use for later synthesis
90 90
91 91 This command examines every changeset in the given range (or all
92 92 of history if none are specified) and creates a simple statistical
93 93 model of the history of the repository. It also measures the directory
94 94 structure of the repository as checked out.
95 95
96 96 The model is written out to a JSON file, and can be used by
97 97 :hg:`synthesize` to create or augment a repository with synthetic
98 98 commits that have a structure that is statistically similar to the
99 99 analyzed repository.
100 100 '''
101 101 root = repo.root
102 102 if not root.endswith(os.path.sep):
103 103 root += os.path.sep
104 104
105 105 revs = list(revs)
106 106 revs.extend(opts['rev'])
107 107 if not revs:
108 108 revs = [':']
109 109
110 110 output = opts['output']
111 111 if not output:
112 112 output = os.path.basename(root) + '.json'
113 113
114 114 if output == '-':
115 115 fp = sys.stdout
116 116 else:
117 117 fp = open(output, 'w')
118 118
119 119 # Always obtain file counts of each directory in the given root directory.
120 120 def onerror(e):
121 121 ui.warn(_('error walking directory structure: %s\n') % e)
122 122
123 123 dirs = {}
124 124 rootprefixlen = len(root)
125 125 for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
126 126 dirpathfromroot = dirpath[rootprefixlen:]
127 127 dirs[dirpathfromroot] = len(filenames)
128 128 if '.hg' in dirnames:
129 129 dirnames.remove('.hg')
130 130
131 131 lineschanged = zerodict()
132 132 children = zerodict()
133 133 p1distance = zerodict()
134 134 p2distance = zerodict()
135 135 linesinfilesadded = zerodict()
136 136 fileschanged = zerodict()
137 137 filesadded = zerodict()
138 138 filesremoved = zerodict()
139 139 linelengths = zerodict()
140 140 interarrival = zerodict()
141 141 parents = zerodict()
142 142 dirsadded = zerodict()
143 143 tzoffset = zerodict()
144 144
145 145 # If a mercurial repo is available, also model the commit history.
146 146 if repo:
147 147 revs = scmutil.revrange(repo, revs)
148 148 revs.sort()
149 149
150 150 progress = ui.progress
151 151 _analyzing = _('analyzing')
152 152 _changesets = _('changesets')
153 153 _total = len(revs)
154 154
155 155 for i, rev in enumerate(revs):
156 156 progress(_analyzing, i, unit=_changesets, total=_total)
157 157 ctx = repo[rev]
158 158 pl = ctx.parents()
159 159 pctx = pl[0]
160 160 prev = pctx.rev()
161 161 children[prev] += 1
162 162 p1distance[rev - prev] += 1
163 163 parents[len(pl)] += 1
164 164 tzoffset[ctx.date()[1]] += 1
165 165 if len(pl) > 1:
166 166 p2distance[rev - pl[1].rev()] += 1
167 167 if prev == rev - 1:
168 168 lastctx = pctx
169 169 else:
170 170 lastctx = repo[rev - 1]
171 171 if lastctx.rev() != nullrev:
172 172 timedelta = ctx.date()[0] - lastctx.date()[0]
173 173 interarrival[roundto(timedelta, 300)] += 1
174 174 diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
175 175 fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
176 176 for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
177 177 if isbin:
178 178 continue
179 179 added = sum(lineadd.itervalues(), 0)
180 180 if mar == 'm':
181 181 if added and lineremove:
182 182 lineschanged[roundto(added, 5),
183 183 roundto(lineremove, 5)] += 1
184 184 filechanges += 1
185 185 elif mar == 'a':
186 186 fileadds += 1
187 187 if '/' in filename:
188 188 filedir = filename.rsplit('/', 1)[0]
189 189 if filedir not in pctx.dirs():
190 190 diradds += 1
191 191 linesinfilesadded[roundto(added, 5)] += 1
192 192 elif mar == 'r':
193 193 fileremoves += 1
194 194 for length, count in lineadd.iteritems():
195 195 linelengths[length] += count
196 196 fileschanged[filechanges] += 1
197 197 filesadded[fileadds] += 1
198 198 dirsadded[diradds] += 1
199 199 filesremoved[fileremoves] += 1
200 200
201 201 invchildren = zerodict()
202 202
203 203 for rev, count in children.iteritems():
204 204 invchildren[count] += 1
205 205
206 206 if output != '-':
207 207 ui.status(_('writing output to %s\n') % output)
208 208
209 209 def pronk(d):
210 210 return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
211 211
212 212 json.dump({'revs': len(revs),
213 213 'initdirs': pronk(dirs),
214 214 'lineschanged': pronk(lineschanged),
215 215 'children': pronk(invchildren),
216 216 'fileschanged': pronk(fileschanged),
217 217 'filesadded': pronk(filesadded),
218 218 'linesinfilesadded': pronk(linesinfilesadded),
219 219 'dirsadded': pronk(dirsadded),
220 220 'filesremoved': pronk(filesremoved),
221 221 'linelengths': pronk(linelengths),
222 222 'parents': pronk(parents),
223 223 'p1distance': pronk(p1distance),
224 224 'p2distance': pronk(p2distance),
225 225 'interarrival': pronk(interarrival),
226 226 'tzoffset': pronk(tzoffset),
227 227 },
228 228 fp)
229 229 fp.close()
230 230
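The dump above is an ordinary JSON object whose values are frequency tables: pronk() flattens each defaultdict into a list of [value, count] pairs sorted by descending count, which is exactly the shape that synthesize()'s cdf() consumes. Trimmed to a few keys, and with invented numbers, the output looks roughly like:

{
  "revs": 1200,
  "lineschanged": [[[5, 0], 318], [[10, 5], 102]],
  "filesadded": [[1, 700], [2, 210], [0, 150]],
  "p1distance": [[1, 1100], [2, 64]],
  "tzoffset": [[25200, 900], [0, 300]]
}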
231 231 @command('synthesize',
232 232 [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
233 233 ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
234 234 ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
235 235 _('hg synthesize [OPTION].. DESCFILE'))
236 236 def synthesize(ui, repo, descpath, **opts):
237 237 '''synthesize commits based on a model of an existing repository
238 238
239 239 The model must have been generated by :hg:`analyze`. Commits will
240 240 be generated randomly according to the probabilities described in
241 241 the model. If --initfiles is set, the repository will be seeded with
241 241 the given number of files following the modeled repository's directory
243 243 structure.
244 244
245 245 When synthesizing new content, commit descriptions, and user
246 246 names, words will be chosen randomly from a dictionary that is
247 247 presumed to contain one word per line. Use --dict to specify the
248 248 path to an alternate dictionary to use.
249 249 '''
250 250 try:
251 251 fp = hg.openpath(ui, descpath)
252 252 except Exception, err:
253 253 raise util.Abort('%s: %s' % (descpath, err[0].strerror))
254 254 desc = json.load(fp)
255 255 fp.close()
256 256
257 257 def cdf(l):
258 258 if not l:
259 259 return [], []
260 260 vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
261 261 t = float(sum(probs, 0))
262 262 s, cdfs = 0, []
263 263 for v in probs:
264 264 s += v
265 265 cdfs.append(s / t)
266 266 return vals, cdfs
267 267
268 268 lineschanged = cdf(desc['lineschanged'])
269 269 fileschanged = cdf(desc['fileschanged'])
270 270 filesadded = cdf(desc['filesadded'])
271 271 dirsadded = cdf(desc['dirsadded'])
272 272 filesremoved = cdf(desc['filesremoved'])
273 273 linelengths = cdf(desc['linelengths'])
274 274 parents = cdf(desc['parents'])
275 275 p1distance = cdf(desc['p1distance'])
276 276 p2distance = cdf(desc['p2distance'])
277 277 interarrival = cdf(desc['interarrival'])
278 278 linesinfilesadded = cdf(desc['linesinfilesadded'])
279 279 tzoffset = cdf(desc['tzoffset'])
280 280
281 281 dictfile = opts.get('dict') or '/usr/share/dict/words'
282 282 try:
283 283 fp = open(dictfile, 'rU')
284 284 except IOError, err:
285 285 raise util.Abort('%s: %s' % (dictfile, err.strerror))
286 286 words = fp.read().splitlines()
287 287 fp.close()
288 288
289 289 initdirs = {}
290 290 if desc['initdirs']:
291 291 for k, v in desc['initdirs']:
292 292 initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
293 293 initdirs = renamedirs(initdirs, words)
294 294 initdirscdf = cdf(initdirs)
295 295
296 296 def pick(cdf):
297 297 return cdf[0][bisect.bisect_left(cdf[1], random.random())]
298 298
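cdf() and pick() together are textbook inverse-CDF sampling: cdf() precomputes a cumulative distribution over the model's [value, count] pairs, and pick() binary-searches it with a uniform random number, so values are drawn in proportion to their observed frequency. A self-contained restatement with made-up data:

import bisect, random

def cdf(pairs):
    vals, counts = zip(*sorted(pairs, key=lambda x: x[1], reverse=True))
    total = float(sum(counts))
    running, cdfs = 0, []
    for c in counts:
        running += c
        cdfs.append(running / total)
    return vals, cdfs

def pick(model):
    vals, cdfs = model
    return vals[bisect.bisect_left(cdfs, random.random())]

linelengths = cdf([(10, 70), (40, 20), (80, 10)])  # cdfs become [0.7, 0.9, 1.0]
samples = [pick(linelengths) for __ in xrange(1000)]
# roughly 70% of samples are 10, 20% are 40, 10% are 80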
299 299 def pickpath():
300 300 return os.path.join(pick(initdirscdf), random.choice(words))
301 301
302 302 def makeline(minimum=0):
303 303 total = max(minimum, pick(linelengths))
304 304 c, l = 0, []
305 305 while c < total:
306 306 w = random.choice(words)
307 307 c += len(w) + 1
308 308 l.append(w)
309 309 return ' '.join(l)
310 310
311 311 wlock = repo.wlock()
312 312 lock = repo.lock()
313 313
314 314 nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))
315 315
316 316 progress = ui.progress
317 317 _synthesizing = _('synthesizing')
318 318 _files = _('initial files')
319 319 _changesets = _('changesets')
320 320
321 321 # Synthesize a single initial revision adding files to the repo according
322 322 # to the modeled directory structure.
323 323 initcount = int(opts['initfiles'])
324 324 if initcount and initdirs:
325 325 pctx = repo[None].parents()[0]
326 dirs = set(pctx.dirs())
326 327 files = {}
328
329 def validpath(path):
330 # Don't pick filenames which are already directory names.
331 if path in dirs:
332 return False
333 # Don't pick directories which were used as file names.
334 while path:
335 if path in files:
336 return False
337 path = os.path.dirname(path)
338 return True
339
327 340 for i in xrange(0, initcount):
328 341 ui.progress(_synthesizing, i, unit=_files, total=initcount)
329 342
330 343 path = pickpath()
331 while path in pctx.dirs():
344 while not validpath(path):
332 345 path = pickpath()
333 346 data = '%s contents\n' % path
334 347 files[path] = context.memfilectx(repo, path, data)
348 dir = os.path.dirname(path)
349 while dir and dir not in dirs:
350 dirs.add(dir)
351 dir = os.path.dirname(dir)
335 352
336 353 def filectxfn(repo, memctx, path):
337 354 return files[path]
338 355
339 356 ui.progress(_synthesizing, None)
340 357 message = 'synthesized wide repo with %d files' % (len(files),)
341 358 mc = context.memctx(repo, [pctx.node(), nullid], message,
342 359 files.iterkeys(), filectxfn, ui.username(),
343 360 '%d %d' % util.makedate())
344 361 initnode = mc.commit()
345 362 hexfn = ui.debugflag and hex or short
346 363 ui.status(_('added commit %s with %d files\n')
347 364 % (hexfn(initnode), len(files)))
348 365
349 366 # Synthesize incremental revisions to the repository, adding repo depth.
350 367 count = int(opts['count'])
351 368 heads = set(map(repo.changelog.rev, repo.heads()))
352 369 for i in xrange(count):
353 370 progress(_synthesizing, i, unit=_changesets, total=count)
354 371
355 372 node = repo.changelog.node
356 373 revs = len(repo)
357 374
358 375 def pickhead(heads, distance):
359 376 if heads:
360 377 lheads = sorted(heads)
361 378 rev = revs - min(pick(distance), revs)
362 379 if rev < lheads[-1]:
363 380 rev = lheads[bisect.bisect_left(lheads, rev)]
364 381 else:
365 382 rev = lheads[-1]
366 383 return rev, node(rev)
367 384 return nullrev, nullid
368 385
369 386 r1 = revs - min(pick(p1distance), revs)
370 387 p1 = node(r1)
371 388
372 389 # the number of heads will grow without bound if we use a pure
373 390 # model, so artificially constrain their proliferation
374 391 toomanyheads = len(heads) > random.randint(1, 20)
375 392 if p2distance[0] and (pick(parents) == 2 or toomanyheads):
376 393 r2, p2 = pickhead(heads.difference([r1]), p2distance)
377 394 else:
378 395 r2, p2 = nullrev, nullid
379 396
380 397 pl = [p1, p2]
381 398 pctx = repo[r1]
382 399 mf = pctx.manifest()
383 400 mfk = mf.keys()
384 401 changes = {}
385 402 if mfk:
386 403 for __ in xrange(pick(fileschanged)):
387 404 for __ in xrange(10):
388 405 fctx = pctx.filectx(random.choice(mfk))
389 406 path = fctx.path()
390 407 if not (path in nevertouch or fctx.isbinary() or
391 408 'l' in fctx.flags()):
392 409 break
393 410 lines = fctx.data().splitlines()
394 411 add, remove = pick(lineschanged)
395 412 for __ in xrange(remove):
396 413 if not lines:
397 414 break
398 415 del lines[random.randrange(0, len(lines))]
399 416 for __ in xrange(add):
400 417 lines.insert(random.randint(0, len(lines)), makeline())
401 418 path = fctx.path()
402 419 changes[path] = context.memfilectx(repo, path,
403 420 '\n'.join(lines) + '\n')
404 421 for __ in xrange(pick(filesremoved)):
405 422 path = random.choice(mfk)
406 423 for __ in xrange(10):
407 424 path = random.choice(mfk)
408 425 if path not in changes:
409 426 changes[path] = None
410 427 break
411 428 if filesadded:
412 429 dirs = list(pctx.dirs())
413 430 dirs.insert(0, '')
414 431 for __ in xrange(pick(filesadded)):
415 432 pathstr = ''
416 433 while pathstr in dirs:
417 434 path = [random.choice(dirs)]
418 435 if pick(dirsadded):
419 436 path.append(random.choice(words))
420 437 path.append(random.choice(words))
421 438 pathstr = '/'.join(filter(None, path))
422 439 data = '\n'.join(makeline()
423 440 for __ in xrange(pick(linesinfilesadded))) + '\n'
424 441 changes[pathstr] = context.memfilectx(repo, pathstr, data)
425 442 def filectxfn(repo, memctx, path):
426 443 return changes[path]
427 444 if not changes:
428 445 continue
429 446 if revs:
430 447 date = repo['tip'].date()[0] + pick(interarrival)
431 448 else:
432 449 date = time.time() - (86400 * count)
433 450 # dates in mercurial must be positive, fit in 32-bit signed integers.
434 451 date = min(0x7fffffff, max(0, date))
435 452 user = random.choice(words) + '@' + random.choice(words)
436 453 mc = context.memctx(repo, pl, makeline(minimum=2),
437 454 sorted(changes.iterkeys()),
438 455 filectxfn, user, '%d %d' % (date, pick(tzoffset)))
439 456 newnode = mc.commit()
440 457 heads.add(repo.changelog.rev(newnode))
441 458 heads.discard(r1)
442 459 heads.discard(r2)
443 460
444 461 lock.release()
445 462 wlock.release()
446 463
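End to end, the two commands form a pipeline: analyze an existing repository into a JSON model, then synthesize a new repository from it. A typical session might look like the following (file names and counts are illustrative, and the synthrepo extension must be enabled in your configuration):

hg -R existing-repo analyze --output model.json
hg init synthetic
hg -R synthetic synthesize --initfiles 1000 --count 500 model.json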
447 464 def renamedirs(dirs, words):
448 465 '''Randomly rename the directory names in the per-dir file count dict.'''
449 466 wordgen = itertools.cycle(words)
450 467 replacements = {'': ''}
451 468 def rename(dirpath):
452 469 '''Recursively rename the directory and all path prefixes.
453 470
454 471 The mapping from path to renamed path is stored for all path prefixes
455 472 as in dynamic programming, ensuring linear runtime and consistent
456 473 renaming regardless of iteration order through the model.
457 474 '''
458 475 if dirpath in replacements:
459 476 return replacements[dirpath]
460 477 head, _ = os.path.split(dirpath)
461 478 head = head and rename(head) or ''
462 479 renamed = os.path.join(head, wordgen.next())
463 480 replacements[dirpath] = renamed
464 481 return renamed
465 482 result = []
466 483 for dirpath, count in dirs.iteritems():
467 484 result.append([rename(dirpath.lstrip(os.sep)), count])
468 485 return result
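renamedirs() maps every directory in the model through a memoized, prefix-consistent renaming, so sibling directories keep a common renamed parent and each prefix is renamed exactly once. The same idea in a standalone sketch (makerenamer and the word list are invented; the result is deterministic because itertools.cycle is consumed in call order):

import itertools, os

def makerenamer(words):
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        # Memoized: a prefix already renamed keeps its earlier name.
        if dirpath in replacements:
            return replacements[dirpath]
        head = os.path.split(dirpath)[0]
        head = rename(head) if head else ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    return rename

rename = makerenamer(['ant', 'bee', 'cat'])
print(rename('src/core'))   # ant/bee
print(rename('src/utils'))  # ant/cat -- 'src' renames to 'ant' both times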