##// END OF EJS Templates
synthrepo: create filectx instance in 'filectxfn' callback...
Martin von Zweigbergk -
r35399:2123e762 default
parent child Browse files
Show More
@@ -1,516 +1,516 b''
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
from __future__ import absolute_import
import bisect
import collections
import itertools
import json
import os
import random
import sys
import time

from mercurial.i18n import _
from mercurial.node import (
    hex,
    nullid,
    nullrev,
    short,
)
from mercurial import (
    context,
    error,
    hg,
    patch,
    registrar,
    scmutil,
    util,
)
64 64
# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
# be specifying the version(s) of Mercurial they are tested with, or
# leave the attribute unspecified.
testedwith = 'ships-with-hg-core'

# Command table; populated by the @command decorator applied to
# analyze() and synthesize() below.
cmdtable = {}
command = registrar.command(cmdtable)

# First six characters of git-diff header lines that indicate a file
# is being added (new file, rename, or copy source/target).
newfile = {'new fi', 'rename', 'copy f', 'copy t'}
75 75
def zerodict():
    '''Return a counting dict: any missing key reads as 0.'''
    # int() == 0, so this is equivalent to defaultdict(lambda: 0).
    return collections.defaultdict(int)
78 78
def roundto(x, k):
    '''Round x to the nearest multiple of k, once x exceeds 2*k.

    Small values (x <= 2*k) are only rounded to the nearest integer, so
    fine distinctions near zero are preserved in the histograms.
    '''
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
83 83
def parsegitdiff(lines):
    '''Parse a git-style diff stream into per-file change summaries.

    Yields (filename, mar, lineadd, lineremove, binary) tuples, where
    mar is 'a'dded, 'm'odified or 'r'emoved, lineadd maps rounded added
    line lengths to counts, lineremove is the number of removed lines,
    and binary flags a binary diff.
    '''
    fname = None
    mar = None
    added = zerodict()
    removed = 0
    isbin = False
    for line in lines:
        tag = line[:6]
        if tag == 'diff -':
            # A new per-file section starts: emit the previous one.
            if fname:
                yield fname, mar, added, removed, isbin
            mar, added, removed, isbin = 'm', zerodict(), 0, False
            fname = patch.gitre.match(line).group(1)
        elif tag in newfile:
            mar = 'a'
        elif tag == 'GIT bi':
            isbin = True
        elif tag == 'delete':
            mar = 'r'
        elif tag:
            first = tag[0]
            # Skip the '---'/'+++' file headers; count real hunk lines.
            if first == '-' and not line.startswith('--- '):
                removed += 1
            elif first == '+' and not line.startswith('+++ '):
                added[roundto(len(line) - 1, 5)] += 1
    # Emit the final file section, if any.
    if fname:
        yield fname, mar, added, removed, isbin
108 108
@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        # Don't descend into the repository's own metadata.
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    # Histograms of the measured properties; each maps an observed value
    # (usually rounded) to its occurrence count.
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.progress
        _analyzing = _('analyzing')
        _changesets = _('changesets')
        _total = len(revs)

        for i, rev in enumerate(revs):
            progress(_analyzing, i, unit=_changesets, total=_total)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            # Interarrival time is measured against the previous revision
            # in revlog order, reusing pctx when it happens to be rev - 1.
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                        filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1

    # Invert the children histogram: number of children -> occurrences.
    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # Serialize a histogram as [value, count] pairs, most common first.
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    # Don't close the process-wide stdout stream when writing to '-';
    # the unconditional close() broke any further output from hg itself.
    if fp is not sys.stdout:
        fp.close()
255 255
@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # Format the exception itself. The old 'err[0].strerror' indexed
        # the exception args and raised AttributeError for plain errno
        # integers (e.g. IOError's args[0]), masking the real error.
        raise error.Abort('%s: %s' % (descpath, err))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Build (values, cumulative-probabilities) from [value, count]
        # pairs, for weighted random sampling via pick() below.
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            # '.hg' cannot be used as a tracked directory name.
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Weighted random choice from a (values, cdfs) pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Assemble a line of random words of roughly a modeled length.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

    progress = ui.progress
    _synthesizing = _('synthesizing')
    _files = _('initial files')
    _changesets = _('changesets')

    # Synthesize a single initial revision adding files to the repo according
    # to the modeled directory structure.
    initcount = int(opts['initfiles'])
    if initcount and initdirs:
        pctx = repo[None].parents()[0]
        dirs = set(pctx.dirs())
        files = {}

        def validpath(path):
            # Don't pick filenames which are already directory names.
            if path in dirs:
                return False
            # Don't pick directories which were used as file names.
            while path:
                if path in files:
                    return False
                path = os.path.dirname(path)
            return True

        for i in xrange(0, initcount):
            ui.progress(_synthesizing, i, unit=_files, total=initcount)

            path = pickpath()
            while not validpath(path):
                path = pickpath()
            data = '%s contents\n' % path
            files[path] = data
            dir = os.path.dirname(path)
            while dir and dir not in dirs:
                dirs.add(dir)
                dir = os.path.dirname(dir)

        def filectxfn(repo, memctx, path):
            # Create the filectx lazily, inside the callback.
            return context.memfilectx(repo, path, files[path])

        ui.progress(_synthesizing, None)
        message = 'synthesized wide repo with %d files' % (len(files),)
        mc = context.memctx(repo, [pctx.node(), nullid], message,
                            files.iterkeys(), filectxfn, ui.username(),
                            '%d %d' % util.makedate())
        initnode = mc.commit()
        if ui.debugflag:
            # 'hex' comes from mercurial.node (see imports); the builtin
            # hex() would fail on a binary node.
            hexfn = hex
        else:
            hexfn = short
        ui.status(_('added commit %s with %d files\n')
                  % (hexfn(initnode), len(files)))

    # Synthesize incremental revisions to the repository, adding repo depth.
    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    for i in xrange(count):
        progress(_synthesizing, i, unit=_changesets, total=count)

        node = repo.changelog.node
        revs = len(repo)

        def pickhead(heads, distance):
            # Choose an existing head roughly 'distance' revisions back.
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid

        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        toomanyheads = len(heads) > random.randint(1, 20)
        if p2distance[0] and (pick(parents) == 2 or toomanyheads):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        # changes maps path -> new file content, or None for a removal.
        changes = {}
        if mfk:
            for __ in xrange(pick(fileschanged)):
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                path = fctx.path()
                changes[path] = '\n'.join(lines) + '\n'
            for __ in xrange(pick(filesremoved)):
                path = random.choice(mfk)
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        # Mark the path as removed so it is listed in the
                        # commit; without this marker the modeled removals
                        # were silently dropped, since only keys of
                        # 'changes' are passed to memctx below.
                        changes[path] = None
                        break
        if filesadded:
            dirs = list(pctx.dirs())
            dirs.insert(0, '')
        for __ in xrange(pick(filesadded)):
            pathstr = ''
            while pathstr in dirs:
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                pathstr = '/'.join(filter(None, path))
            data = '\n'.join(makeline()
                             for __ in xrange(pick(linesinfilesadded))) + '\n'
            changes[pathstr] = data
        def filectxfn(repo, memctx, path):
            # None (or an unknown path) signals removal to memctx;
            # otherwise build the filectx lazily in the callback.
            data = changes.get(path)
            if data is None:
                return None
            return context.memfilectx(repo, path, data)
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            date = time.time() - (86400 * count)
        # dates in mercurial must be positive, fit in 32-bit signed integers.
        date = min(0x7fffffff, max(0, date))
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)

    lock.release()
    wlock.release()
491 491
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.'''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        # Take only the head; unpacking as 'head, _ = ...' shadowed the
        # i18n gettext function '_' imported at module level.
        head = os.path.split(dirpath)[0]
        if head:
            head = rename(head)
        else:
            head = ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    # items() (not iteritems()) keeps the identical result while also
    # working on Python 3.
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
General Comments 0
You need to be logged in to leave comments. Login now