# HG changeset patch # User Mike Edgar # Date 2014-09-13 02:04:29 # Node ID 4c66e70c348829173b2eb35f2114d26035690940 # Parent 38e0363dcbe0edf55f33545a7b769bce0c7b3fee contrib/synthrepo: generate initial repo contents using directory shape model Augments the synthesize command to use an additional parameter to the analyzed repo model: the number of files in each directory at a given snapshot. Before synthesizing history, an arbitrary number of files will be generated in a distribution matching the analyzed directory structure. Intended for developing, testing and measuring scaling improvements when importing/converting a large repository to Mercurial. diff --git a/contrib/synthrepo.py b/contrib/synthrepo.py --- a/contrib/synthrepo.py +++ b/contrib/synthrepo.py @@ -35,10 +35,10 @@ A few obvious properties that are not cu - Symlinks and binary files are ignored ''' -import bisect, collections, json, os, random, time, sys +import bisect, collections, itertools, json, os, random, time, sys from mercurial import cmdutil, context, patch, scmutil, util, hg from mercurial.i18n import _ -from mercurial.node import nullrev, nullid +from mercurial.node import nullrev, nullid, short testedwith = 'internal' @@ -208,14 +208,17 @@ def analyze(ui, repo, *revs, **opts): @command('synthesize', [('c', 'count', 0, _('create given number of commits'), _('COUNT')), - ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))], + ('', 'dict', '', _('path to a dictionary of words'), _('FILE')), + ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))], _('hg synthesize [OPTION].. DESCFILE')) def synthesize(ui, repo, descpath, **opts): '''synthesize commits based on a model of an existing repository The model must have been generated by :hg:`analyze`. Commits will be generated randomly according to the probabilities described in - the model. + the model. 
If --initfiles is set, the repository will be seeded with + the given number of files following the modeled repository's directory + structure. When synthesizing new content, commit descriptions, and user names, words will be chosen randomly from a dictionary that is @@ -261,9 +264,19 @@ def synthesize(ui, repo, descpath, **opt words = fp.read().splitlines() fp.close() + initdirs = {} + if desc['initdirs']: + for k, v in desc['initdirs']: + initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v + initdirs = renamedirs(initdirs, words) + initdirscdf = cdf(initdirs) + def pick(cdf): return cdf[0][bisect.bisect_left(cdf[1], random.random())] + def pickpath(): + return os.path.join(pick(initdirscdf), random.choice(words)) + def makeline(minimum=0): total = max(minimum, pick(linelengths)) c, l = 0, [] @@ -280,8 +293,38 @@ def synthesize(ui, repo, descpath, **opt progress = ui.progress _synthesizing = _('synthesizing') + _files = _('initial files') _changesets = _('changesets') + # Synthesize a single initial revision adding files to the repo according + # to the modeled directory structure. + initcount = int(opts['initfiles']) + if initcount and initdirs: + pctx = repo[None].parents()[0] + files = {} + for i in xrange(0, initcount): + ui.progress(_synthesizing, i, unit=_files, total=initcount) + + path = pickpath() + while path in pctx.dirs(): + path = pickpath() + data = '%s contents\n' % path + files[path] = context.memfilectx(repo, path, data) + + def filectxfn(repo, memctx, path): + return files[path] + + ui.progress(_synthesizing, None) + message = 'synthesized wide repo with %d files' % (len(files),) + mc = context.memctx(repo, [pctx.node(), nullid], message, + files.iterkeys(), filectxfn, ui.username(), + '%d %d' % util.makedate()) + initnode = mc.commit() + hexfn = ui.debugflag and hex or short + ui.status(_('added commit %s with %d files\n') + % (hexfn(initnode), len(files))) + + # Synthesize incremental revisions to the repository, adding repo depth. 
count = int(opts['count']) heads = set(map(repo.changelog.rev, repo.heads())) for i in xrange(count): @@ -374,3 +417,26 @@ def synthesize(ui, repo, descpath, **opt lock.release() wlock.release() + +def renamedirs(dirs, words): + '''Randomly rename the directory names in the per-dir file count dict.''' + wordgen = itertools.cycle(words) + replacements = {'': ''} + def rename(dirpath): + '''Recursively rename the directory and all path prefixes. + + The mapping from path to renamed path is stored for all path prefixes + as in dynamic programming, ensuring linear runtime and consistent + renaming regardless of iteration order through the model. + ''' + if dirpath in replacements: + return replacements[dirpath] + head, _ = os.path.split(dirpath) + head = head and rename(head) or '' + renamed = os.path.join(head, wordgen.next()) + replacements[dirpath] = renamed + return renamed + result = [] + for dirpath, count in dirs.iteritems(): + result.append([rename(dirpath.lstrip(os.sep)), count]) + return result