##// END OF EJS Templates
synthrepo: use pycompat.xrange...
Gregory Szorc -
r43274:c07812bd default
parent child Browse files
Show More
@@ -1,520 +1,522 b''
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
39 39 from __future__ import absolute_import
40 40 import bisect
41 41 import collections
42 42 import itertools
43 43 import json
44 44 import os
45 45 import random
46 46 import sys
47 47 import time
48 48
49 49 from mercurial.i18n import _
50 50 from mercurial.node import (
51 51 nullid,
52 52 nullrev,
53 53 short,
54 54 )
55 55 from mercurial import (
56 56 context,
57 57 diffutil,
58 58 error,
59 59 hg,
60 60 patch,
61 pycompat,
61 62 registrar,
62 63 scmutil,
63 64 )
64 65 from mercurial.utils import (
65 66 dateutil,
66 67 )
67 68
68 69 # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
69 70 # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
70 71 # be specifying the version(s) of Mercurial they are tested with, or
71 72 # leave the attribute unspecified.
72 73 testedwith = 'ships-with-hg-core'
73 74
74 75 cmdtable = {}
75 76 command = registrar.command(cmdtable)
76 77
77 78 newfile = {'new fi', 'rename', 'copy f', 'copy t'}
78 79
def zerodict():
    """Return a defaultdict whose missing keys default to 0.

    Used throughout as a counter for building probability tables.
    int() is the idiomatic zero factory and avoids a throwaway lambda.
    """
    return collections.defaultdict(int)
81 82
def roundto(x, k):
    """Round x to the nearest multiple of k, but only coarsen large values.

    Values no greater than 2*k are merely rounded to the nearest integer,
    preserving resolution for small samples.
    """
    threshold = k * 2
    if x <= threshold:
        return int(round(x))
    nearest_multiple = round(x / float(k)) * k
    return int(nearest_multiple)
86 87
def parsegitdiff(lines):
    '''Parse a git-style diff into per-file change summaries.

    Yields (filename, mar, lineadd, lineremove, binary) tuples, where mar
    is 'a' (added), 'm' (modified) or 'r' (removed), lineadd maps rounded
    added-line lengths to counts, lineremove counts removed lines, and
    binary flags GIT binary patches.
    '''
    fname = None
    mar = None
    added, removed = zerodict(), 0
    isbinary = False
    for line in lines:
        prefix = line[:6]
        if prefix == 'diff -':
            # A new per-file section begins: flush the previous one first.
            if fname:
                yield fname, mar, added, removed, isbinary
            mar, added, removed, isbinary = 'm', zerodict(), 0, False
            fname = patch.gitre.match(line).group(1)
        elif prefix in newfile:
            mar = 'a'
        elif prefix == 'GIT bi':
            isbinary = True
        elif prefix == 'delete':
            mar = 'r'
        elif prefix:
            ch = prefix[0]
            # Skip the '---'/'+++' header lines; count real hunk lines.
            if ch == '-' and not line.startswith('--- '):
                removed += 1
            elif ch == '+' and not line.startswith('+++ '):
                added[roundto(len(line) - 1, 5)] += 1
    if fname:
        yield fname, mar, added, removed, isbinary
111 112
@command('analyze',
    [('o', 'output', '', _('write output to given file'), _('FILE')),
     ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
    _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.makeprogress(_('analyzing'), unit=_('changesets'),
                                   total=len(revs))
        # The diff options never change; build them once, outside the loop.
        diffopts = diffutil.diffallopts(ui, {'git': True})
        for i, rev in enumerate(revs):
            progress.update(i)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines()
                        for d in ctx.diff(pctx, opts=diffopts)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                # dict.values()/items() work on both Python 2 and 3;
                # itervalues()/iteritems() do not exist on Python 3.
                added = sum(lineadd.values(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.items():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1
        progress.complete()

    invchildren = zerodict()

    for rev, count in children.items():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # Serialize a counter as (key, count) pairs, most frequent first.
        return sorted(d.items(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    # Do not close the process-wide stdout stream when writing to '-'.
    if fp is not sys.stdout:
        fp.close()
258 259
@command('synthesize',
    [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
     ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
     ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
    _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number of files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # The original 'err[0].strerror' raises on Python 3 (exceptions are
        # not indexable) and was broken for most exception types on Python 2
        # as well; report the error portably instead.
        raise error.Abort('%s: %s' % (descpath, getattr(err, 'strerror', err)))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Turn a list of (value, count) pairs into a (values, cumulative
        # probabilities) pair suitable for weighted sampling via pick().
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Sample one value from a (values, cumulative-probabilities) pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Build a synthetic line of at least 'minimum' characters.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()
    try:
        nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

        _synthesizing = _('synthesizing')
        _files = _('initial files')
        _changesets = _('changesets')

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo['.']
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            progress = ui.makeprogress(_synthesizing, unit=_files,
                                       total=initcount)
            for i in pycompat.xrange(0, initcount):
                progress.update(i)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                data = '%s contents\n' % path
                files[path] = data
                dir = os.path.dirname(path)
                while dir and dir not in dirs:
                    dirs.add(dir)
                    dir = os.path.dirname(dir)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            progress.complete()
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                files, filectxfn, ui.username(),
                                '%d %d' % dateutil.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                # The builtin hex() cannot format a binary node and
                # mercurial.node.hex is not in this module's top-level
                # imports; import it locally for --debug output.
                from mercurial.node import hex as hexfn
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        progress = ui.makeprogress(_synthesizing, unit=_changesets,
                                   total=count)
        for i in pycompat.xrange(count):
            progress.update(i)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}
            if mfk:
                for __ in pycompat.xrange(pick(fileschanged)):
                    # Retry a bounded number of times to find a touchable
                    # file (not special, binary, or a symlink).
                    for __ in pycompat.xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in pycompat.xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in pycompat.xrange(add):
                        lines.insert(random.randint(0, len(lines)), makeline())
                    path = fctx.path()
                    changes[path] = '\n'.join(lines) + '\n'
                for __ in pycompat.xrange(pick(filesremoved)):
                    for __ in pycompat.xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            break
            if filesadded:
                dirs = list(pctx.dirs())
                dirs.insert(0, '')
            for __ in pycompat.xrange(pick(filesadded)):
                pathstr = ''
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = '/'.join(filter(None, path))
                data = '\n'.join(
                    makeline()
                    for __ in pycompat.xrange(pick(linesinfilesadded))) + '\n'
                changes[pathstr] = data

            def filectxfn(repo, memctx, path):
                if path not in changes:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])

            if not changes:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7fffffff, max(0, date))
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(changes),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
        progress.complete()
    finally:
        # Release in reverse acquisition order; previously the locks were
        # released only on the success path and leaked on any error.
        lock.release()
        wlock.release()
495 497
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.

    Returns a list of [renamed-path, count] pairs, one per input directory,
    with every path component consistently replaced by a dictionary word.
    '''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        # '_tail' rather than '_': don't shadow the i18n '_' helper
        # imported at module level.
        head, _tail = os.path.split(dirpath)
        if head:
            head = rename(head)
        else:
            head = ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    # dict.items() is portable to Python 3; iteritems() is not.
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
General Comments 0
You need to be logged in to leave comments. Login now