##// END OF EJS Templates
synthrepo: pass a diffopts object to context.diff
Boris Feld -
r38586:97469c54 default
parent child Browse files
Show More
@@ -1,517 +1,521
1 1 # synthrepo.py - repo synthesis
2 2 #
3 3 # Copyright 2012 Facebook
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 '''synthesize structurally interesting change history
9 9
10 10 This extension is useful for creating a repository with properties
11 11 that are statistically similar to an existing repository. During
12 12 analysis, a simple probability table is constructed from the history
13 13 of an existing repository. During synthesis, these properties are
14 14 reconstructed.
15 15
16 16 Properties that are analyzed and synthesized include the following:
17 17
18 18 - Lines added or removed when an existing file is modified
19 19 - Number and sizes of files added
20 20 - Number of files removed
21 21 - Line lengths
22 22 - Topological distance to parent changeset(s)
23 23 - Probability of a commit being a merge
24 24 - Probability of a newly added file being added to a new directory
25 25 - Interarrival time, and time zone, of commits
26 26 - Number of files in each directory
27 27
28 28 A few obvious properties that are not currently handled realistically:
29 29
30 30 - Merges are treated as regular commits with two parents, which is not
31 31 realistic
32 32 - Modifications are not treated as operations on hunks of lines, but
33 33 as insertions and deletions of randomly chosen single lines
34 34 - Committer ID (always random)
35 35 - Executability of files
36 36 - Symlinks and binary files are ignored
37 37 '''
38 38
from __future__ import absolute_import
import bisect
import collections
import itertools
import json
import os
import random
import sys
import time

from mercurial.i18n import _
from mercurial.node import (
    hex,
    nullid,
    nullrev,
    short,
)
from mercurial import (
    context,
    error,
    hg,
    patch,
    registrar,
    scmutil,
)
from mercurial.utils import (
    dateutil,
    diffutil,
)
64 67
# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
# be specifying the version(s) of Mercurial they are tested with, or
# leave the attribute unspecified.
testedwith = 'ships-with-hg-core'

# Command table populated by the @command decorator applied to
# analyze() and synthesize() below.
cmdtable = {}
command = registrar.command(cmdtable)

# Six-character prefixes of git-diff header lines that mark a file as
# newly introduced (new file / rename / copy markers); consumed by
# parsegitdiff() when classifying each file as added.
newfile = {'new fi', 'rename', 'copy f', 'copy t'}
75 78
def zerodict():
    '''Return a counter-style dict whose missing keys default to 0.'''
    # int() == 0, so defaultdict(int) is the idiomatic spelling of a
    # zero-defaulting counter (no per-miss lambda call needed).
    return collections.defaultdict(int)
78 81
def roundto(x, k):
    '''Bucket x to the nearest multiple of k once it exceeds 2*k.

    Small values (x <= 2*k) keep full integer resolution; larger
    values are quantized so the statistical tables stay compact.
    '''
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
83 86
def parsegitdiff(lines):
    '''Parse a git-style diff, yielding one stats tuple per file.

    Each yielded tuple is (filename, mar, lineadd, lineremove, binary),
    where mar is 'a'/'m'/'r' for added/modified/removed, lineadd maps
    rounded added-line lengths to counts, lineremove is the number of
    removed lines, and binary flags GIT binary patches.
    '''
    filename = None
    mar = None
    lineadd = zerodict()
    lineremove = 0
    binary = False
    for line in lines:
        prefix = line[:6]
        if prefix == 'diff -':
            # New per-file header: flush the previous file's stats.
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
            filename = patch.gitre.match(line).group(1)
        elif prefix in newfile:
            mar = 'a'
        elif prefix == 'GIT bi':
            binary = True
        elif prefix == 'delete':
            mar = 'r'
        elif prefix:
            # Hunk body: count added/removed lines, skipping the
            # '---'/'+++' file-name header lines.
            ch = prefix[0]
            if ch == '+' and not line.startswith('+++ '):
                lineadd[roundto(len(line) - 1, 5)] += 1
            elif ch == '-' and not line.startswith('--- '):
                lineremove += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary
108 111
@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        # The diff options do not depend on the revision, so build them
        # once outside the loop (and use the public repo.ui rather than
        # reaching into ctx._repo).
        diffopts = diffutil.diffopts(repo.ui, {'git': True})

        progress = ui.makeprogress(_('analyzing'), unit=_('changesets'),
                                   total=len(revs))
        for i, rev in enumerate(revs):
            progress.update(i)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines()
                        for d in ctx.diff(pctx, opts=diffopts)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1
        progress.complete()

    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        # Sort a counter dict into (key, count) pairs, most common first.
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    # Only close files we opened ourselves; '-o -' writes to stdout,
    # which must stay open for the rest of the process.
    if fp is not sys.stdout:
        fp.close()
254 258
@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # Not every exception is indexable or carries strerror (the old
        # err[0].strerror spelling raised here itself), so fall back to
        # the exception text when strerror is absent.
        raise error.Abort('%s: %s' % (descpath,
                                      getattr(err, 'strerror', err)))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Turn a list of (value, weight) pairs into parallel lists of
        # values and cumulative probabilities, for weighted sampling
        # via pick() below.
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Weighted random choice from a (values, cumulative-probs) pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Build a line of at least `minimum` characters out of random
        # dictionary words, with total length drawn from the model.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = repo.wlock()
    lock = repo.lock()
    # Make sure the repository locks are released even when synthesis
    # fails partway through (previously an exception leaked both locks).
    try:
        nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

        _synthesizing = _('synthesizing')
        _files = _('initial files')
        _changesets = _('changesets')

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            progress = ui.makeprogress(_synthesizing, unit=_files,
                                       total=initcount)
            for i in xrange(0, initcount):
                progress.update(i)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                data = '%s contents\n' % path
                files[path] = data
                dir = os.path.dirname(path)
                while dir and dir not in dirs:
                    dirs.add(dir)
                    dir = os.path.dirname(dir)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            progress.complete()
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                files, filectxfn, ui.username(),
                                '%d %d' % dateutil.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                hexfn = hex
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding
        # repo depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        progress = ui.makeprogress(_synthesizing, unit=_changesets,
                                   total=count)
        for i in xrange(count):
            progress.update(i)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)),
                                     makeline())
                    path = fctx.path()
                    changes[path] = '\n'.join(lines) + '\n'
                for __ in xrange(pick(filesremoved)):
                    path = random.choice(mfk)
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            break
                    # Record the removal: filectxfn maps a None entry to
                    # a None return, which memctx treats as a deletion.
                    # (Previously the chosen path was never recorded, so
                    # the modeled file removals never actually happened.)
                    if path not in changes:
                        changes[path] = None
            if filesadded:
                dirs = list(pctx.dirs())
                dirs.insert(0, '')
            for __ in xrange(pick(filesadded)):
                pathstr = ''
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = '/'.join(filter(None, path))
                data = '\n'.join(makeline()
                                 for __ in xrange(pick(linesinfilesadded))
                                 ) + '\n'
                changes[pathstr] = data

            def filectxfn(repo, memctx, path):
                # None marks a file removed in this synthetic commit.
                if changes.get(path) is None:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])
            if not changes:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7fffffff, max(0, date))
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(changes),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
        progress.complete()
    finally:
        lock.release()
        wlock.release()
492 496
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.

    Returns a list of [renamed-path, count] pairs. Renaming is
    consistent: a given path prefix always maps to the same
    replacement, regardless of iteration order.
    '''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        # Take only the head; unpacking the tail into "_" would shadow
        # the gettext function imported from mercurial.i18n.
        head = os.path.split(dirpath)[0]
        if head:
            head = rename(head)
        else:
            head = ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    # items() instead of iteritems(): identical behavior on Python 2,
    # and keeps this stdlib-only helper portable to Python 3.
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
General Comments 0
You need to be logged in to leave comments. Login now