##// END OF EJS Templates
synthrepo: simply use the ui passed as a function argument
Yuya Nishihara -
r38603:c6398fc2 default
parent child Browse files
Show More
@@ -1,521 +1,521 b''
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
39 39 from __future__ import absolute_import
40 40 import bisect
41 41 import collections
42 42 import itertools
43 43 import json
44 44 import os
45 45 import random
46 46 import sys
47 47 import time
48 48
49 49 from mercurial.i18n import _
50 50 from mercurial.node import (
51 51 nullid,
52 52 nullrev,
53 53 short,
54 54 )
55 55 from mercurial import (
56 56 context,
57 57 error,
58 58 hg,
59 59 patch,
60 60 registrar,
61 61 scmutil,
62 62 )
63 63 from mercurial.utils import (
64 64 dateutil,
65 65 diffutil,
66 66 )
67 67
68 68 # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
69 69 # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
70 70 # be specifying the version(s) of Mercurial they are tested with, or
71 71 # leave the attribute unspecified.
72 72 testedwith = 'ships-with-hg-core'
73 73
74 74 cmdtable = {}
75 75 command = registrar.command(cmdtable)
76 76
77 77 newfile = {'new fi', 'rename', 'copy f', 'copy t'}
78 78
def zerodict():
    """Return a counter-style defaultdict whose missing keys read as 0.

    Uses the ``int`` builtin as the factory instead of ``lambda: 0``:
    identical behavior, but idiomatic, slightly faster, and picklable.
    """
    return collections.defaultdict(int)
81 81
def roundto(x, k):
    """Round x to the nearest integer, snapping to multiples of k once
    x exceeds 2*k (coarse-grained buckets for larger values)."""
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
86 86
def parsegitdiff(lines):
    """Walk git-style diff lines, yielding one tuple per file touched.

    Each yielded tuple is (filename, mar, lineadd, lineremove, binary)
    where mar is 'm'odified / 'a'dded / 'r'emoved, lineadd maps bucketed
    line lengths to counts, lineremove is a plain count, and binary flags
    a 'GIT binary patch' section.
    """
    fname = None
    mar = None
    added = zerodict()
    removed = 0
    isbin = False
    for text in lines:
        head = text[:6]
        if head == 'diff -':
            # New file section: flush the previous file's stats first.
            if fname:
                yield fname, mar, added, removed, isbin
            mar, added, removed, isbin = 'm', zerodict(), 0, False
            fname = patch.gitre.match(text).group(1)
        elif head in newfile:
            mar = 'a'
        elif head == 'GIT bi':
            isbin = True
        elif head == 'delete':
            mar = 'r'
        elif head:
            marker = head[0]
            # Count hunk body lines only; skip the '---'/'+++' headers.
            if marker == '-' and not text.startswith('--- '):
                removed += 1
            elif marker == '+' and not text.startswith('+++ '):
                added[roundto(len(text) - 1, 5)] += 1
    if fname:
        yield fname, mar, added, removed, isbin
111 111
@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    # NOTE(review): optionalrepo=True permits repo to be None, yet
    # repo.root is read unconditionally here -- confirm whether a
    # repo-less invocation is actually supported.
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        # Prune the .hg metadata directory from the walk.
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    # Per-property frequency tables; keys are (bucketed) observed values,
    # values are occurrence counts.
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.makeprogress(_('analyzing'), unit=_('changesets'),
                                   total=len(revs))
        for i, rev in enumerate(revs):
            progress.update(i)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            # Use the ui passed as a function argument rather than
            # reaching into ctx._repo's private state.  (The source
            # paste contained both the old and new assignment; only
            # this one is intended.)
            diffopts = diffutil.diffopts(ui, {'git': True})
            diff = sum((d.splitlines()
                        for d in ctx.diff(pctx, opts=diffopts)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1
        progress.complete()

    # Invert children: map "number of children" -> "how many revs had it".
    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # Serialize a frequency table as [value, count] pairs, most
        # frequent first (the shape cdf() in synthesize expects).
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    # Don't close sys.stdout when writing to '-'; closing it would break
    # any subsequent output from the hg process.
    if fp is not sys.stdout:
        fp.close()
258 258
@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # NOTE(review): err[0].strerror assumes a Python-2-style tuple
        # exception; confirm this still renders a useful message for the
        # exceptions openpath can actually raise.
        raise error.Abort('%s: %s' % (descpath, err[0].strerror))
    # The model is the JSON document written by `hg analyze`.
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Turn a [value, count] frequency list (most frequent first) into
        # a (values, cumulative-probabilities) pair usable by pick().
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    # Pre-build a sampling distribution for every modeled property.
    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    # Word list used to synthesize file contents, paths, users and
    # commit messages.
    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    # Rebuild the modeled directory structure with randomized names,
    # neutering any '.hg' path components on the way in.
    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Sample one value from a (values, cumulative-probs) pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        # Random file path: a modeled directory plus a random word.
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Build a space-joined line of random words whose length is at
        # least `minimum` and otherwise sampled from the model.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    # NOTE(review): the locks are not released in a try/finally, so an
    # exception below leaks them -- confirm whether that is acceptable
    # for this contrib tool.
    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

    _synthesizing = _('synthesizing')
    _files = _('initial files')
    _changesets = _('changesets')

    # Synthesize a single initial revision adding files to the repo according
    # to the modeled directory structure.
    initcount = int(opts['initfiles'])
    if initcount and initdirs:
        pctx = repo[None].parents()[0]
        dirs = set(pctx.dirs())
        files = {}

        def validpath(path):
            # Don't pick filenames which are already directory names.
            if path in dirs:
                return False
            # Don't pick directories which were used as file names.
            while path:
                if path in files:
                    return False
                path = os.path.dirname(path)
            return True

        progress = ui.makeprogress(_synthesizing, unit=_files, total=initcount)
        for i in xrange(0, initcount):
            progress.update(i)

            path = pickpath()
            while not validpath(path):
                path = pickpath()
            data = '%s contents\n' % path
            files[path] = data
            # Record every ancestor directory so later validpath() calls
            # reject it as a file name.
            dir = os.path.dirname(path)
            while dir and dir not in dirs:
                dirs.add(dir)
                dir = os.path.dirname(dir)

        def filectxfn(repo, memctx, path):
            return context.memfilectx(repo, memctx, path, files[path])

        progress.complete()
        message = 'synthesized wide repo with %d files' % (len(files),)
        mc = context.memctx(repo, [pctx.node(), nullid], message,
                            files, filectxfn, ui.username(),
                            '%d %d' % dateutil.makedate())
        initnode = mc.commit()
        if ui.debugflag:
            # NOTE(review): `hex` is not among the names imported from
            # mercurial.node in this file's visible imports -- verify
            # this branch works under --debug.
            hexfn = hex
        else:
            hexfn = short
        ui.status(_('added commit %s with %d files\n')
                  % (hexfn(initnode), len(files)))

    # Synthesize incremental revisions to the repository, adding repo depth.
    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    progress = ui.makeprogress(_synthesizing, unit=_changesets, total=count)
    for i in xrange(count):
        progress.update(i)

        node = repo.changelog.node
        revs = len(repo)

        def pickhead(heads, distance):
            # Choose an existing head near the sampled topological
            # distance; fall back to the null revision when no heads.
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid

        # First parent: sampled topological distance back from tip.
        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        toomanyheads = len(heads) > random.randint(1, 20)
        if p2distance[0] and (pick(parents) == 2 or toomanyheads):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        changes = {}
        if mfk:
            # Modify a sampled number of existing files: delete and insert
            # randomly chosen single lines per the model.
            for __ in xrange(pick(fileschanged)):
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    # Retry (up to 10 times) until we land on a plain,
                    # non-binary, non-symlink, non-metadata file.
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                path = fctx.path()
                changes[path] = '\n'.join(lines) + '\n'
            # NOTE(review): this loop only selects paths; nothing marks
            # them removed in `changes` -- verify file removal actually
            # takes effect in the synthesized commit.
            for __ in xrange(pick(filesremoved)):
                path = random.choice(mfk)
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        break
        if filesadded:
            dirs = list(pctx.dirs())
            dirs.insert(0, '')
        # Add new files in sampled directories (possibly new directories).
        for __ in xrange(pick(filesadded)):
            pathstr = ''
            while pathstr in dirs:
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                pathstr = '/'.join(filter(None, path))
            data = '\n'.join(makeline()
                             for __ in xrange(pick(linesinfilesadded))) + '\n'
            changes[pathstr] = data
        def filectxfn(repo, memctx, path):
            if path not in changes:
                return None
            return context.memfilectx(repo, memctx, path, changes[path])
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            date = time.time() - (86400 * count)
        # dates in mercurial must be positive, fit in 32-bit signed integers.
        date = min(0x7fffffff, max(0, date))
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        # Track head churn: the new commit is a head; its parents are not.
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)
    progress.complete()

    lock.release()
    wlock.release()
496 496
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.

    dirs maps directory path -> file count; words supplies replacement
    names (cycled).  Returns a list of [renamed-path, count] pairs.

    Fixes relative to the original: use .items() instead of the
    Python-2-only .iteritems() (works on both 2 and 3), and stop
    shadowing the gettext '_' import inside rename().
    '''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        # os.path.split()[0] is the parent prefix; rename it first so the
        # whole chain of prefixes is renamed consistently.
        head = os.path.split(dirpath)[0]
        if head:
            head = rename(head)
        else:
            head = ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
General Comments 0
You need to be logged in to leave comments. Login now