##// END OF EJS Templates
synthrepo: use progress helper...
Martin von Zweigbergk -
r38427:6540333a default
parent child Browse files
Show More
@@ -1,516 +1,514 b''
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
39 39 from __future__ import absolute_import
40 40 import bisect
41 41 import collections
42 42 import itertools
43 43 import json
44 44 import os
45 45 import random
46 46 import sys
47 47 import time
48 48
49 49 from mercurial.i18n import _
50 50 from mercurial.node import (
51 51 nullid,
52 52 nullrev,
53 53 short,
54 54 )
55 55 from mercurial import (
56 56 context,
57 57 error,
58 58 hg,
59 59 patch,
60 60 registrar,
61 61 scmutil,
62 62 )
63 63 from mercurial.utils import dateutil
64 64
65 65 # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
66 66 # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
67 67 # be specifying the version(s) of Mercurial they are tested with, or
68 68 # leave the attribute unspecified.
69 69 testedwith = 'ships-with-hg-core'
70 70
71 71 cmdtable = {}
72 72 command = registrar.command(cmdtable)
73 73
74 74 newfile = {'new fi', 'rename', 'copy f', 'copy t'}
75 75
def zerodict():
    """Return a counter dict whose missing keys read as 0."""
    return collections.defaultdict(int)
78 78
def roundto(x, k):
    """Round x to the nearest multiple of k, but only once x exceeds 2*k.

    Small values keep their (integer-rounded) identity so that the model
    retains resolution near zero.
    """
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
83 83
def parsegitdiff(lines):
    '''Summarize a git-format diff, generating one entry per touched file.

    Yields (filename, mar, lineadd, lineremove, binary) tuples where mar
    is 'a'/'m'/'r' for added/modified/removed, lineadd maps rounded added
    line lengths to counts, lineremove is the number of removed lines and
    binary flags "GIT binary" patches.
    '''
    current = None
    mar, added, removed, isbinary = None, zerodict(), 0, False
    for line in lines:
        tag = line[:6]
        if tag == 'diff -':
            # New per-file header: flush the previous file's summary.
            if current:
                yield current, mar, added, removed, isbinary
            mar, added, removed, isbinary = 'm', zerodict(), 0, False
            current = patch.gitre.match(line).group(1)
        elif tag in newfile:
            mar = 'a'
        elif tag == 'delete':
            mar = 'r'
        elif tag == 'GIT bi':
            isbinary = True
        elif tag:
            # Hunk body: count removed lines and bucket added line lengths
            # ('---'/'+++' file markers are excluded).
            if tag.startswith('-') and not line.startswith('--- '):
                removed += 1
            elif tag.startswith('+') and not line.startswith('+++ '):
                # len - 1 compensates for the leading '+'.
                added[roundto(len(line) - 1, 5)] += 1
    if current:
        yield current, mar, added, removed, isbinary
108 108
@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    # '-' means stdout; remember not to close it on the way out.
    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.makeprogress(_('analyzing'), unit=_('changesets'),
                                   total=len(revs))
        for i, rev in enumerate(revs):
            progress.update(i)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1
        # Tell the progress machinery we are done so the bar is cleared
        # (the synthesize path already does this; analyze was missing it).
        progress.complete()

    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # Most-frequent-first list of (value, count) pairs for the model.
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    # Only close file handles we opened ourselves: closing sys.stdout
    # (the '--output -' case) would break any later output from hg.
    if fp is not sys.stdout:
        fp.close()
255 252
@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # err may be any exception type; report its strerror when it has
        # one, else its message.  (The old 'err[0].strerror' indexed the
        # exception and raised instead of reporting the real problem.)
        raise error.Abort('%s: %s' % (descpath,
                                      getattr(err, 'strerror', err)))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Build a (values, cumulative-probabilities) pair from a list of
        # (value, weight) entries, most frequent first.
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Sample one value from a cdf() pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Assemble a random line of at least `minimum` characters.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()
    # Release the locks even if commit synthesis fails part-way; the old
    # code leaked both locks on any exception.
    try:
        nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

        _synthesizing = _('synthesizing')
        _files = _('initial files')
        _changesets = _('changesets')

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            progress = ui.makeprogress(_synthesizing, unit=_files,
                                       total=initcount)
            for i in xrange(0, initcount):
                progress.update(i)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                data = '%s contents\n' % path
                files[path] = data
                dir = os.path.dirname(path)
                while dir and dir not in dirs:
                    dirs.add(dir)
                    dir = os.path.dirname(dir)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            progress.complete()
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                files, filectxfn, ui.username(),
                                '%d %d' % dateutil.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                # 'hex' was never imported at module level, so the old
                # 'hexfn = hex' picked up the hex() builtin, which fails on
                # a binary node.  Use the real node-to-hex helper.
                from mercurial.node import hex as hexfn
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        progress = ui.makeprogress(_synthesizing, unit=_changesets,
                                   total=count)
        for i in xrange(count):
            progress.update(i)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                # Pick an existing head roughly `distance` revs back.
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)), makeline())
                    path = fctx.path()
                    changes[path] = '\n'.join(lines) + '\n'
                # NOTE(review): the picked removal paths are never recorded
                # in `changes`, so synthesized commits never actually remove
                # files.  Preserved as-is to avoid changing model output.
                for __ in xrange(pick(filesremoved)):
                    path = random.choice(mfk)
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            break
            if filesadded:
                dirs = list(pctx.dirs())
                dirs.insert(0, '')
            for __ in xrange(pick(filesadded)):
                pathstr = ''
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = '/'.join(filter(None, path))
                data = '\n'.join(makeline()
                                 for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[pathstr] = data

            def filectxfn(repo, memctx, path):
                if path not in changes:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])
            if not changes:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7fffffff, max(0, date))
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(changes),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
        # Clear the progress bar (missing in the original after this loop).
        progress.complete()
    finally:
        lock.release()
        wlock.release()
491 489
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.

    Returns a list of [renamed-path, count] pairs.  Each distinct original
    path component sequence maps to a consistent renamed sequence, with
    replacement names drawn cyclically from `words`.
    '''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        # os.path.dirname instead of the old "head, _ = os.path.split(...)",
        # which shadowed the i18n '_' helper imported at module level.
        head = os.path.dirname(dirpath)
        if head:
            head = rename(head)
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    # dict.items() behaves identically to iteritems() here (the list copy is
    # harmless) and keeps the function usable on Python 3 as well.
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
General Comments 0
You need to be logged in to leave comments. Login now