synthrepo.py
516 lines
| 17.7 KiB
| text/x-python
|
PythonLexer
/ contrib / synthrepo.py
Bryan O'Sullivan
|
# synthrepo.py - repo synthesis
#
# Copyright 2012 Facebook
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
'''synthesize structurally interesting change history

This extension is useful for creating a repository with properties
that are statistically similar to an existing repository. During
analysis, a simple probability table is constructed from the history
of an existing repository. During synthesis, these properties are
reconstructed.

Properties that are analyzed and synthesized include the following:

- Lines added or removed when an existing file is modified
- Number and sizes of files added
- Number of files removed
- Line lengths
- Topological distance to parent changeset(s)
- Probability of a commit being a merge
- Probability of a newly added file being added to a new directory
- Interarrival time, and time zone, of commits
- Number of files in each directory

A few obvious properties that are not currently handled realistically:

- Merges are treated as regular commits with two parents, which is not
  realistic
- Modifications are not treated as operations on hunks of lines, but
  as insertions and deletions of randomly chosen single lines
- Committer ID (always random)
- Executability of files
- Symlinks and binary files are ignored
'''
Pulkit Goyal
|
r28563 | from __future__ import absolute_import | ||
import bisect | ||||
import collections | ||||
import itertools | ||||
import json | ||||
import os | ||||
import random | ||||
import sys | ||||
import time | ||||
Yuya Nishihara
|
r29205 | |||
from mercurial.i18n import _ | ||||
from mercurial.node import ( | ||||
nullid, | ||||
nullrev, | ||||
short, | ||||
) | ||||
Pulkit Goyal
|
r28563 | from mercurial import ( | ||
context, | ||||
error, | ||||
hg, | ||||
patch, | ||||
Yuya Nishihara
|
r32337 | registrar, | ||
Pulkit Goyal
|
r28563 | scmutil, | ||
) | ||||
Boris Feld
|
r36625 | from mercurial.utils import dateutil | ||
Bryan O'Sullivan
|
r17734 | |||
# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
# be specifying the version(s) of Mercurial they are tested with, or
# leave the attribute unspecified.
testedwith = 'ships-with-hg-core'

# Command table for this extension, filled in by the @command decorator.
cmdtable = {}
command = registrar.command(cmdtable)

# Six-character line prefixes that parsegitdiff() treats as "this file is
# being added" markers when scanning a git-style diff.
newfile = {'new fi', 'rename', 'copy f', 'copy t'}
Bryan O'Sullivan
|
r17734 | |||
def zerodict():
    '''Return a new mapping in which every missing key reads as 0.

    Used throughout this module as a simple histogram/counter.
    '''
    return collections.defaultdict(int)
def roundto(x, k):
    '''Bucket x for histogram purposes and return the bucket as an int.

    Values of at most 2 * k are simply rounded to the nearest integer;
    larger values are rounded to the nearest multiple of k.
    '''
    if x <= 2 * k:
        return int(round(x))
    multiples = round(x / float(k))
    return int(multiples * k)
def parsegitdiff(lines):
    '''Summarize a git-style diff, one record per file.

    ``lines`` is an iterable of diff lines without line terminators.

    Yields ``(filename, mar, lineadd, lineremove, binary)`` tuples where:

    - ``filename`` is the first group matched by ``patch.gitre`` on the
      file's ``diff -`` header line
    - ``mar`` is 'm', 'a' or 'r' (modified, added, removed)
    - ``lineadd`` maps bucketed line length -> number of added lines
    - ``lineremove`` is the number of removed lines
    - ``binary`` is True if a ``GIT bi...`` line was seen for the file
    '''
    filename = mar = None
    lineadd, lineremove, binary = zerodict(), 0, False

    for line in lines:
        prefix = line[:6]

        if prefix == 'diff -':
            # A new file header: flush whatever was collected for the
            # previous file before resetting the per-file state.
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, binary = 'm', False
            lineadd, lineremove = zerodict(), 0
            filename = patch.gitre.match(line).group(1)
            continue
        if prefix in newfile:
            mar = 'a'
            continue
        if prefix == 'GIT bi':
            binary = True
            continue
        if prefix == 'delete':
            mar = 'r'
            continue

        # Anything else is either hunk content or a header we ignore; the
        # '--- ' and '+++ ' file header lines must not be counted as content.
        if line.startswith('-'):
            if not line.startswith('--- '):
                lineremove += 1
        elif line.startswith('+') and not line.startswith('+++ '):
            # drop the leading '+' when measuring the line length
            lineadd[roundto(len(line) - 1, 5)] += 1

    if filename:
        yield filename, mar, lineadd, lineremove, binary
def _countdirfiles(ui, root):
    '''Return a dict mapping each directory under root to its file count.

    Keys are directory paths relative to ``root`` (which must end with a
    path separator); the root itself is keyed by ''. ``.hg`` directories
    are not descended into. Errors while walking are reported through
    ``ui.warn`` and otherwise ignored.
    '''
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirs[dirpath[rootprefixlen:]] = len(filenames)
        # Pruning dirnames in place stops os.walk from descending into it.
        if '.hg' in dirnames:
            dirnames.remove('.hg')
    return dirs

def _analyzehistory(ui, repo, revs):
    '''Tally per-changeset statistics for the (sorted) revisions ``revs``.

    Returns a dict mapping model key -> histogram (see zerodict()). Every
    histogram is empty when ``revs`` is empty.
    '''
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    progress = ui.progress
    _analyzing = _('analyzing')
    _changesets = _('changesets')
    _total = len(revs)

    for i, rev in enumerate(revs):
        progress(_analyzing, i, unit=_changesets, total=_total)
        ctx = repo[rev]
        pl = ctx.parents()
        pctx = pl[0]
        prev = pctx.rev()
        children[prev] += 1
        p1distance[rev - prev] += 1
        parents[len(pl)] += 1
        tzoffset[ctx.date()[1]] += 1
        if len(pl) > 1:
            p2distance[rev - pl[1].rev()] += 1

        # Interarrival time is measured against the numerically preceding
        # revision, which is not necessarily a parent.
        if prev == rev - 1:
            lastctx = pctx
        else:
            lastctx = repo[rev - 1]
        if lastctx.rev() != nullrev:
            timedelta = ctx.date()[0] - lastctx.date()[0]
            interarrival[roundto(timedelta, 300)] += 1

        # Chain the chunks lazily; concatenating lists with sum() is
        # quadratic in the size of the diff.
        diff = itertools.chain.from_iterable(
            d.splitlines() for d in ctx.diff(pctx, git=True))
        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
        for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
            if isbin:
                continue
            added = sum(lineadd.itervalues(), 0)
            if mar == 'm':
                if added and lineremove:
                    lineschanged[roundto(added, 5),
                                 roundto(lineremove, 5)] += 1
                    filechanges += 1
            elif mar == 'a':
                fileadds += 1
                if '/' in filename:
                    filedir = filename.rsplit('/', 1)[0]
                    if filedir not in pctx.dirs():
                        diradds += 1
                linesinfilesadded[roundto(added, 5)] += 1
            elif mar == 'r':
                fileremoves += 1
            for length, count in lineadd.iteritems():
                linelengths[length] += count
        fileschanged[filechanges] += 1
        filesadded[fileadds] += 1
        dirsadded[diradds] += 1
        filesremoved[fileremoves] += 1

    # Close the progress topic now that every revision has been visited.
    progress(_analyzing, None)

    # The model stores how many revisions have N children, not the
    # per-revision child counts themselves.
    invchildren = zerodict()
    for count in children.itervalues():
        invchildren[count] += 1

    return {
        'lineschanged': lineschanged,
        'children': invchildren,
        'fileschanged': fileschanged,
        'filesadded': filesadded,
        'linesinfilesadded': linesinfilesadded,
        'dirsadded': dirsadded,
        'filesremoved': filesremoved,
        'linelengths': linelengths,
        'parents': parents,
        'p1distance': p1distance,
        'p2distance': p2distance,
        'interarrival': interarrival,
        'tzoffset': tzoffset,
    }

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    # NOTE(review): the command is registered with optionalrepo=True and the
    # history pass below is guarded by "if repo", yet repo.root is read
    # unconditionally here -- confirm what the root should be without a repo.
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        # root always ends with a separator at this point and basename() of
        # such a path is empty, so strip the separator before taking it.
        output = os.path.basename(root.rstrip(os.path.sep)) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    try:
        # Always obtain file counts of each directory in the given root
        # directory.
        dirs = _countdirfiles(ui, root)

        # If a mercurial repo is available, also model the commit history.
        if repo:
            revs = scmutil.revrange(repo, revs)
            revs.sort()
            stats = _analyzehistory(ui, repo, revs)
        else:
            stats = _analyzehistory(ui, repo, [])

        if output != '-':
            ui.status(_('writing output to %s\n') % output)

        def pronk(d):
            # (key, count) pairs, most frequent first
            return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

        model = {'revs': len(revs), 'initdirs': pronk(dirs)}
        for key, table in stats.iteritems():
            model[key] = pronk(table)
        json.dump(model, fp)
    finally:
        # Never close the process-wide stdout; only files opened here.
        if fp is not sys.stdout:
            fp.close()
def _abortreason(err):
    '''Return the most specific human-readable message available for err.'''
    try:
        # Errors that wrap an underlying OS-level error as their first
        # argument carry the useful text on that wrapped error.
        reason = err[0].strerror
    except (AttributeError, IndexError, TypeError):
        reason = None
    return reason or getattr(err, 'strerror', None) or str(err)

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        raise error.Abort('%s: %s' % (descpath, _abortreason(err)))
    try:
        desc = json.load(fp)
    finally:
        fp.close()

    def cdf(l):
        '''Turn (value, count) pairs into (values, cumulative probabilities),
        most frequent value first.'''
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    try:
        words = fp.read().splitlines()
    finally:
        fp.close()
    if not words:
        # every generator below draws from words; fail early and clearly
        raise error.Abort('%s: no words found' % dictfile)

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        '''Draw one value at random according to the given distribution.'''
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        '''Build a line of random words at least ``minimum`` long.'''
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = lock = None
    try:
        wlock = repo.wlock()
        lock = repo.lock()

        nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

        progress = ui.progress
        _synthesizing = _('synthesizing')
        _files = _('initial files')
        _changesets = _('changesets')

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            for i in xrange(0, initcount):
                ui.progress(_synthesizing, i, unit=_files, total=initcount)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                files[path] = '%s contents\n' % path
                # remember every new ancestor directory of the file
                parent = os.path.dirname(path)
                while parent and parent not in dirs:
                    dirs.add(parent)
                    parent = os.path.dirname(parent)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            ui.progress(_synthesizing, None)
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                files, filectxfn, ui.username(),
                                '%d %d' % dateutil.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                # The builtin hex() only accepts integers; binary node ids
                # need Mercurial's own helper.
                from mercurial.node import hex as hexfn
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        for i in xrange(count):
            progress(_synthesizing, i, unit=_changesets, total=count)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}     # path -> new file content
            removed = set()  # paths deleted by this changeset
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    # Try up to ten times to find a regular text file; if
                    # none turns up, the last candidate is used regardless.
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)),
                                     makeline())
                    changes[fctx.path()] = '\n'.join(lines) + '\n'
                for __ in xrange(pick(filesremoved)):
                    # Look for a file this changeset does not already
                    # touch and record it so that it really gets removed.
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes and path not in removed:
                            removed.add(path)
                            break

            dirs = list(pctx.dirs())
            dirs.insert(0, '')
            for __ in xrange(pick(filesadded)):
                pathstr = ''
                # a new file must not take the name of an existing directory
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = '/'.join(filter(None, path))
                numlines = pick(linesinfilesadded)
                data = '\n'.join(makeline() for __ in xrange(numlines)) + '\n'
                changes[pathstr] = data
                # new content wins over an earlier removal of the same path
                removed.discard(pathstr)

            def filectxfn(repo, memctx, path):
                # Listed paths without content are the removed files;
                # returning None reports them as deleted.
                if path not in changes:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])

            if not changes and not removed:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7fffffff, max(0, date))
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(removed.union(changes)),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)

        # close the changeset progress topic
        progress(_synthesizing, None)
    finally:
        # Release in reverse order of acquisition, even on failure.
        if lock is not None:
            lock.release()
        if wlock is not None:
            wlock.release()
Mike Edgar
|
r22708 | |||
def renamedirs(dirs, words):
    '''Replace the directory names in the per-dir file count dict.

    ``dirs`` maps directory path -> file count. Every path component is
    replaced by the next word taken from ``words`` (cycling when they run
    out). Returns a list of ``[renamed path, count]`` pairs.
    '''
    wordgen = itertools.cycle(words)
    # memo of original path -> renamed path; the root maps to itself
    replacements = {'': ''}

    def rename(dirpath):
        '''Return the new name for dirpath, renaming its prefixes first.

        Results are memoized for dirpath and every prefix of it, so a
        shared prefix is always renamed the same way no matter in which
        order the paths are visited.
        '''
        try:
            return replacements[dirpath]
        except KeyError:
            pass
        parent = os.path.split(dirpath)[0]
        newparent = rename(parent) if parent else ''
        newpath = os.path.join(newparent, next(wordgen))
        replacements[dirpath] = newpath
        return newpath

    return [[rename(dirpath.lstrip(os.sep)), count]
            for dirpath, count in dirs.iteritems()]