upstream/mercurial-mirror Commit - r22709:889789a2

1

# synthrepo.py - repo synthesis

1

# synthrepo.py - repo synthesis

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

'''synthesize structurally interesting change history

This extension is useful for creating a repository with properties
that are statistically similar to an existing repository. During
analysis, a simple probability table is constructed from the history
of an existing repository. During synthesis, these properties are
reconstructed.

Properties that are analyzed and synthesized include the following:

- Lines added or removed when an existing file is modified
- Number and sizes of files added
- Number of files removed
- Line lengths
- Topological distance to parent changeset(s)
- Probability of a commit being a merge
- Probability of a newly added file being added to a new directory
- Interarrival time, and time zone, of commits
- Number of files in each directory

A few obvious properties that are not currently handled realistically:

- Merges are treated as regular commits with two parents, which is not
  realistic
- Modifications are not treated as operations on hunks of lines, but
  as insertions and deletions of randomly chosen single lines
- Committer ID (always random)
- Executability of files
- Symlinks and binary files are ignored
'''

37

38

import bisect, collections, itertools, json, os, random, time, sys

39

import bisect, collections, itertools, json, os, random, time, sys

39

from mercurial import cmdutil, context, patch, scmutil, util, hg

40

from mercurial import cmdutil, context, patch, scmutil, util, hg

40

from mercurial.i18n import _

41

from mercurial.i18n import _

41

from mercurial.node import nullrev, nullid, short

42

from mercurial.node import nullrev, nullid, short

42

43

testedwith = 'internal'

# Command table populated by the @command decorators below.
cmdtable = {}
command = cmdutil.command(cmdtable)

# Six-character prefixes of git-style diff header lines that parsegitdiff
# treats as marking a file as added ('new file', 'rename ...', 'copy from',
# 'copy to').
newfile = set(('new fi', 'rename', 'copy f', 'copy t'))

49

50

def zerodict():
    '''Return a dict in which missing keys read as the integer 0.

    Used as a counter table: ``d[key] += 1`` works without initialising
    ``d[key]`` first.
    '''
    return collections.defaultdict(int)

52

53

def roundto(x, k):
    '''Round x to an int, coarsening large values to a multiple of k.

    Values greater than ``2 * k`` are rounded to the nearest multiple of
    k; anything else is simply rounded to the nearest integer. This keeps
    small values exact while bucketing large ones.
    '''
    if x > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))

57

58

def parsegitdiff(lines):
    '''Summarise a git-style diff, one tuple per file.

    lines is an iterable of diff lines without line terminators. For each
    file section (introduced by a ``diff -`` header) this generator yields
    ``(filename, mar, lineadd, lineremove, binary)`` where:

    - filename is group 1 of patch.gitre matched against the header line
    - mar is 'm' (modified), 'a' (added, renamed or copied) or 'r' (removed)
    - lineadd is a zerodict mapping the length of each added line, bucketed
      by roundto(..., 5), to the number of added lines of that length
    - lineremove is the number of removed lines
    - binary is True if a ``GIT binary patch`` marker was seen
    '''
    filename, mar, lineadd, lineremove = None, None, zerodict(), 0
    binary = False
    for line in lines:
        # Headers are recognised by their first six characters only.
        start = line[:6]
        if start == 'diff -':
            # A new file section begins: flush the previous one, if any.
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
            filename = patch.gitre.match(line).group(1)
        elif start in newfile:
            mar = 'a'
        elif start == 'GIT bi':
            binary = True
        elif start == 'delete':
            mar = 'r'
        elif start:
            s = start[0]
            # '--- ' and '+++ ' are the file header lines, not content.
            if s == '-' and not line.startswith('--- '):
                lineremove += 1
            elif s == '+' and not line.startswith('+++ '):
                # len(line) - 1 excludes the leading '+' marker.
                lineadd[roundto(len(line) - 1, 5)] += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary

82

83

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
    # The command is registered with optionalrepo=True, so repo is
    # presumably None when run outside a repository; measure the current
    # directory in that case instead of failing on repo.root.
    if repo:
        root = repo.root
    else:
        root = os.getcwd()

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        # Derive the name before the trailing separator is appended below:
        # os.path.basename() of a path ending in a separator is ''.
        output = os.path.basename(root.rstrip(os.path.sep)) + '.json'

    if not root.endswith(os.path.sep):
        root += os.path.sep

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        # Pruning dirnames in place stops os.walk descending into .hg.
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # Number of changesets that were analyzed; stays 0 without a repo.
    revcount = 0

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()
        revcount = len(revs)

        progress = ui.progress
        _analyzing = _('analyzing')
        _changesets = _('changesets')
        _total = len(revs)

        for i, rev in enumerate(revs):
            progress(_analyzing, i, unit=_changesets, total=_total)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            # Interarrival time is measured against the numerically
            # preceding revision, which need not be the first parent.
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            difflines = [line
                         for chunk in ctx.diff(pctx, git=True)
                         for line in chunk.splitlines()]
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            parsed = parsegitdiff(difflines)
            for filename, mar, lineadd, lineremove, isbin in parsed:
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    # Only modifications that both add and remove lines
                    # are counted as file changes.
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                        filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1

    # Histogram of "number of children" over all parents seen.
    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    def pronk(d):
        '''Return the items of d as a list, most frequent first.'''
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    model = {'revs': revcount,
             'initdirs': pronk(dirs),
             'lineschanged': pronk(lineschanged),
             'children': pronk(invchildren),
             'fileschanged': pronk(fileschanged),
             'filesadded': pronk(filesadded),
             'linesinfilesadded': pronk(linesinfilesadded),
             'dirsadded': pronk(dirsadded),
             'filesremoved': pronk(filesremoved),
             'linelengths': pronk(linelengths),
             'parents': pronk(parents),
             'p1distance': pronk(p1distance),
             'p2distance': pronk(p2distance),
             'interarrival': pronk(interarrival),
             'tzoffset': pronk(tzoffset),
             }

    # The output file is opened only now so that it is neither counted by
    # the directory walk above nor left truncated if the analysis fails.
    if output == '-':
        # stdout is not ours to close.
        json.dump(model, sys.stdout)
        sys.stdout.flush()
    else:
        ui.status(_('writing output to %s\n') % output)
        fp = open(output, 'w')
        try:
            json.dump(model, fp)
        finally:
            fp.close()

208

230

209

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    # Only nullrev, nullid and short are imported from mercurial.node at
    # module level, so a bare ``hex`` would be the Python builtin, which
    # cannot format a binary node.
    from mercurial.node import hex as nodehex

    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        raise util.Abort('%s: %s' % (descpath, err[0].strerror))
    try:
        desc = json.load(fp)
    finally:
        fp.close()

    def cdf(l):
        '''Turn [value, weight] pairs into (values, cumulative probs).

        Pairs are ordered by descending weight; the second element of the
        result is the running sum of weights divided by the total.
        '''
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise util.Abort('%s: %s' % (dictfile, err.strerror))
    try:
        words = fp.read().splitlines()
    finally:
        fp.close()

    initdirs = {}
    # .get(): a model file may not carry an 'initdirs' entry at all.
    if desc.get('initdirs'):
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        '''Draw a random value from a (values, cumulative probs) pair.'''
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        '''Return a random file path inside one of the modeled directories.'''
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        '''Return random words joined by spaces, at least as long as a
        drawn line length (and never shorter than minimum).'''
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    wlock = lock = None
    try:
        wlock = repo.wlock()
        lock = repo.lock()

        nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))

        progress = ui.progress
        _synthesizing = _('synthesizing')
        _files = _('initial files')
        _changesets = _('changesets')

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            files = {}
            for i in xrange(0, initcount):
                progress(_synthesizing, i, unit=_files, total=initcount)

                # A new file must not collide with an existing directory.
                path = pickpath()
                while path in pctx.dirs():
                    path = pickpath()
                data = '%s contents\n' % path
                files[path] = context.memfilectx(repo, path, data)

            def filectxfn(repo, memctx, path):
                return files[path]

            progress(_synthesizing, None)
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                files.iterkeys(), filectxfn, ui.username(),
                                '%d %d' % util.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                hexfn = nodehex
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        for i in xrange(count):
            progress(_synthesizing, i, unit=_changesets, total=count)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                '''Pick a head roughly a drawn distance back from tip.'''
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            # path -> memfilectx for added/modified files, None for removals
            changes = {}
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    # Try a few times to find a file that may be modified.
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    else:
                        # Every candidate was untouchable, binary or a
                        # symlink: skip rather than modify such a file.
                        continue
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)),
                                     makeline())
                    changes[path] = context.memfilectx(
                        repo, path, '\n'.join(lines) + '\n')
                for __ in xrange(pick(filesremoved)):
                    # Try a few times to find a file not already changed.
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            changes[path] = None
                            break
            if filesadded:
                dirs = list(pctx.dirs())
                dirs.append('')
            for __ in xrange(pick(filesadded)):
                path = [random.choice(dirs)]
                # A truthy draw means the file goes into a new directory.
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                path = '/'.join(filter(None, path))
                data = '\n'.join(
                    makeline()
                    for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[path] = context.memfilectx(repo, path, data)

            def filectxfn(repo, memctx, path):
                return changes[path]

            if not changes:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(changes.iterkeys()),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
    finally:
        # Release in reverse order of acquisition, even on error.
        if lock is not None:
            lock.release()
        if wlock is not None:
            wlock.release()

420

442

421

def renamedirs(dirs, words):
    '''Rename the directory names in the per-dir file count dict.

    dirs maps a directory path to a file count. Every path component is
    replaced by a word taken from words in order (cycling when words run
    out); the empty path is left as is. Returns a list of
    ``[renamedpath, count]`` pairs, one per entry of dirs.
    '''
    wordgen = itertools.cycle(words)
    # Seeding with the empty path terminates the recursion in rename().
    replacements = {'': ''}

    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        head = os.path.dirname(dirpath)
        if head:
            head = rename(head)
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed

    return [[rename(dirpath.lstrip(os.sep)), count]
            for dirpath, count in dirs.items()]

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # synthrepo.py - repo synthesis
             #
             # Copyright 2012 Facebook
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             '''synthesize structurally interesting change history
             This extension is useful for creating a repository with properties
             that are statistically similar to an existing repository. During
             analysis, a simple probability table is constructed from the history
             of an existing repository.  During synthesis, these properties are
             reconstructed.
             Properties that are analyzed and synthesized include the following:
             - Lines added or removed when an existing file is modified
             - Number and sizes of files added
             - Number of files removed
             - Line lengths
             - Topological distance to parent changeset(s)
             - Probability of a commit being a merge
             - Probability of a newly added file being added to a new directory
             - Interarrival time, and time zone, of commits
+            - Number of files in each directory
             A few obvious properties that are not currently handled realistically:
             - Merges are treated as regular commits with two parents, which is not
               realistic
             - Modifications are not treated as operations on hunks of lines, but
               as insertions and deletions of randomly chosen single lines
             - Committer ID (always random)
             - Executability of files
             - Symlinks and binary files are ignored
             '''
             import bisect, collections, itertools, json, os, random, time, sys
             from mercurial import cmdutil, context, patch, scmutil, util, hg
             from mercurial.i18n import _
             from mercurial.node import nullrev, nullid, short
             testedwith = 'internal'
             cmdtable = {}
             command = cmdutil.command(cmdtable)
             newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
             def zerodict():
                 '''Return a dict in which a missing key reads as the integer 0.'''
                 return collections.defaultdict(int)
             def roundto(x, k):
                 '''Round x to a multiple of k, or just to an integer when x is small.

                 Values greater than 2 * k are snapped to the nearest multiple of k;
                 any other value is only rounded to the nearest integer.
                 '''
                 if not (x > k * 2):
                     return int(round(x))
                 multiples = round(x / float(k))
                 return int(multiples * k)
             def parsegitdiff(lines):
                 '''Yield per-file change statistics from the lines of a git-style diff.

                 Each yielded tuple is (filename, mar, lineadd, lineremove, binary):

                 - mar is 'a' when a new/rename/copy header was seen for the file,
                   'r' when a delete header was seen, and 'm' otherwise
                 - lineadd maps the rounded length of each added line to the number
                   of added lines of that length
                 - lineremove is the number of removed lines
                 - binary is True once a "GIT bi" header was seen for the file
                 '''
                 filename, mar, binary = None, None, False
                 lineadd, lineremove = zerodict(), 0
                 for line in lines:
                     prefix = line[:6]
                     if prefix == 'diff -':
                         # A new file section begins: emit the previous one first.
                         if filename:
                             yield filename, mar, lineadd, lineremove, binary
                         mar, binary = 'm', False
                         lineadd, lineremove = zerodict(), 0
                         filename = patch.gitre.match(line).group(1)
                     elif prefix in newfile:
                         mar = 'a'
                     elif prefix == 'GIT bi':
                         binary = True
                     elif prefix == 'delete':
                         mar = 'r'
                     elif prefix:
                         marker = prefix[0]
                         if marker == '-':
                             # "--- " introduces a file header, not a removed line.
                             if not line.startswith('--- '):
                                 lineremove += 1
                         elif marker == '+':
                             # "+++ " introduces a file header, not an added line.
                             if not line.startswith('+++ '):
                                 # The leading '+' is not part of the line length.
                                 lineadd[roundto(len(line) - 1, 5)] += 1
                 # Emit the statistics of the last file section, if there was one.
                 if filename:
                     yield filename, mar, lineadd, lineremove, binary
             @command('analyze',
-                     [('o', 'output', [], _('write output to given file'), _('FILE')),
+                     [('o', 'output', '', _('write output to given file'), _('FILE')),
                       ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
-                     _('hg analyze'))
+                     _('hg analyze'), optionalrepo=True)
             def analyze(ui, repo, *revs, **opts):
                 '''create a simple model of a repository to use for later synthesis
                 This command examines every changeset in the given range (or all
                 of history if none are specified) and creates a simple statistical
-                model of the history of the repository.
+                model of the history of the repository. It also measures the directory
+                structure of the repository as checked out.
                 The model is written out to a JSON file, and can be used by
                 :hg:`synthesize` to create or augment a repository with synthetic
                 commits that have a structure that is statistically similar to the
                 analyzed repository.
                 '''
+                root = repo.root
+                if not root.endswith(os.path.sep):
+                    root += os.path.sep
                 revs = list(revs)
                 revs.extend(opts['rev'])
                 if not revs:
                     revs = [':']
                 output = opts['output']
                 if not output:
-                    output = os.path.basename(repo.root) + '.json'
+                    output = os.path.basename(root) + '.json'
                 if output == '-':
                     fp = sys.stdout
                 else:
                     fp = open(output, 'w')
-                revs = scmutil.revrange(repo, revs)
+                # Always obtain file counts of each directory in the given root directory.
-                revs.sort()
+                def onerror(e):
+                    ui.warn(_('error walking directory structure: %s\n') % e)
+                dirs = {}
+                rootprefixlen = len(root)
+                for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
+                    dirpathfromroot = dirpath[rootprefixlen:]
+                    dirs[dirpathfromroot] = len(filenames)
+                    if '.hg' in dirnames:
+                        dirnames.remove('.hg')
                 lineschanged = zerodict()
                 children = zerodict()
                 p1distance = zerodict()
                 p2distance = zerodict()
                 linesinfilesadded = zerodict()
                 fileschanged = zerodict()
                 filesadded = zerodict()
                 filesremoved = zerodict()
                 linelengths = zerodict()
                 interarrival = zerodict()
                 parents = zerodict()
                 dirsadded = zerodict()
                 tzoffset = zerodict()
-                progress = ui.progress
+                # If a mercurial repo is available, also model the commit history.
-                _analyzing = _('analyzing')
+                if repo:
-                _changesets = _('changesets')
+                    revs = scmutil.revrange(repo, revs)
-                _total = len(revs)
+                    revs.sort()
+                    progress = ui.progress
+                    _analyzing = _('analyzing')
+                    _changesets = _('changesets')
+                    _total = len(revs)
-                for i, rev in enumerate(revs):
+                    for i, rev in enumerate(revs):
-                    progress(_analyzing, i, unit=_changesets, total=_total)
+                        progress(_analyzing, i, unit=_changesets, total=_total)
-                    ctx = repo[rev]
+                        ctx = repo[rev]
-                    pl = ctx.parents()
+                        pl = ctx.parents()
-                    pctx = pl[0]
+                        pctx = pl[0]
-                    prev = pctx.rev()
+                        prev = pctx.rev()
-                    children[prev] += 1
+                        children[prev] += 1
-                    p1distance[rev - prev] += 1
+                        p1distance[rev - prev] += 1
-                    parents[len(pl)] += 1
+                        parents[len(pl)] += 1
-                    tzoffset[ctx.date()[1]] += 1
+                        tzoffset[ctx.date()[1]] += 1
-                    if len(pl) > 1:
+                        if len(pl) > 1:
-                        p2distance[rev - pl[1].rev()] += 1
+                            p2distance[rev - pl[1].rev()] += 1
-                    if prev == rev - 1:
+                        if prev == rev - 1:
-                        lastctx = pctx
+                            lastctx = pctx
-                    else:
+                        else:
-                        lastctx = repo[rev - 1]
+                            lastctx = repo[rev - 1]
-                    if lastctx.rev() != nullrev:
+                        if lastctx.rev() != nullrev:
-                        interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
+                            timedelta = ctx.date()[0] - lastctx.date()[0]
-                    diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
+                            interarrival[roundto(timedelta, 300)] += 1
-                    fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
+                        diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
-                    for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
+                        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
-                        if binary:
+                        for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
-                            continue
+                            if isbin:
-                        added = sum(lineadd.itervalues(), 0)
+                                continue
-                        if mar == 'm':
+                            added = sum(lineadd.itervalues(), 0)
-                            if added and lineremove:
+                            if mar == 'm':
-                                lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
+                                if added and lineremove:
-                                filechanges += 1
+                                    lineschanged[roundto(added, 5),
-                        elif mar == 'a':
+                                                 roundto(lineremove, 5)] += 1
-                            fileadds += 1
+                                    filechanges += 1
-                            if '/' in filename:
+                            elif mar == 'a':
-                                filedir = filename.rsplit('/', 1)[0]
+                                fileadds += 1
-                                if filedir not in pctx.dirs():
+                                if '/' in filename:
-                                    diradds += 1
+                                    filedir = filename.rsplit('/', 1)[0]
-                            linesinfilesadded[roundto(added, 5)] += 1
+                                    if filedir not in pctx.dirs():
-                        elif mar == 'r':
+                                        diradds += 1
-                            fileremoves += 1
+                                linesinfilesadded[roundto(added, 5)] += 1
-                        for length, count in lineadd.iteritems():
+                            elif mar == 'r':
-                            linelengths[length] += count
+                                fileremoves += 1
-                    fileschanged[filechanges] += 1
+                            for length, count in lineadd.iteritems():
-                    filesadded[fileadds] += 1
+                                linelengths[length] += count
-                    dirsadded[diradds] += 1
+                        fileschanged[filechanges] += 1
-                    filesremoved[fileremoves] += 1
+                        filesadded[fileadds] += 1
+                        dirsadded[diradds] += 1
+                        filesremoved[fileremoves] += 1
                 invchildren = zerodict()
                 for rev, count in children.iteritems():
                     invchildren[count] += 1
                 if output != '-':
                     ui.status(_('writing output to %s\n') % output)
                 def pronk(d):
                     return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
                 json.dump({'revs': len(revs),
+                           'initdirs': pronk(dirs),
                            'lineschanged': pronk(lineschanged),
                            'children': pronk(invchildren),
                            'fileschanged': pronk(fileschanged),
                            'filesadded': pronk(filesadded),
                            'linesinfilesadded': pronk(linesinfilesadded),
                            'dirsadded': pronk(dirsadded),
                            'filesremoved': pronk(filesremoved),
                            'linelengths': pronk(linelengths),
                            'parents': pronk(parents),
                            'p1distance': pronk(p1distance),
                            'p2distance': pronk(p2distance),
                            'interarrival': pronk(interarrival),
                            'tzoffset': pronk(tzoffset),
                            },
                           fp)
                 fp.close()
             @command('synthesize',
                      [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
                       ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
                       ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
                      _('hg synthesize [OPTION].. DESCFILE'))
             def synthesize(ui, repo, descpath, **opts):
                 '''synthesize commits based on a model of an existing repository
                 The model must have been generated by :hg:`analyze`. Commits will
                 be generated randomly according to the probabilities described in
                 the model. If --initfiles is set, the repository will be seeded with
                 the given number files following the modeled repository's directory
                 structure.
                 When synthesizing new content, commit descriptions, and user
                 names, words will be chosen randomly from a dictionary that is
                 presumed to contain one word per line. Use --dict to specify the
                 path to an alternate dictionary to use.
                 '''
                 try:
                     fp = hg.openpath(ui, descpath)
                 except Exception, err:
                     raise util.Abort('%s: %s' % (descpath, err[0].strerror))
                 desc = json.load(fp)
                 fp.close()
                 def cdf(l):
                     if not l:
                         return [], []
                     vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
                     t = float(sum(probs, 0))
                     s, cdfs = 0, []
                     for v in probs:
                         s += v
                         cdfs.append(s / t)
                     return vals, cdfs
                 lineschanged = cdf(desc['lineschanged'])
                 fileschanged = cdf(desc['fileschanged'])
                 filesadded = cdf(desc['filesadded'])
                 dirsadded = cdf(desc['dirsadded'])
                 filesremoved = cdf(desc['filesremoved'])
                 linelengths = cdf(desc['linelengths'])
                 parents = cdf(desc['parents'])
                 p1distance = cdf(desc['p1distance'])
                 p2distance = cdf(desc['p2distance'])
                 interarrival = cdf(desc['interarrival'])
                 linesinfilesadded = cdf(desc['linesinfilesadded'])
                 tzoffset = cdf(desc['tzoffset'])
                 dictfile = opts.get('dict') or '/usr/share/dict/words'
                 try:
                     fp = open(dictfile, 'rU')
                 except IOError, err:
                     raise util.Abort('%s: %s' % (dictfile, err.strerror))
                 words = fp.read().splitlines()
                 fp.close()
                 initdirs = {}
                 if desc['initdirs']:
                     for k, v in desc['initdirs']:
                         initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
                     initdirs = renamedirs(initdirs, words)
                 initdirscdf = cdf(initdirs)
                 def pick(cdf):
                     return cdf[0][bisect.bisect_left(cdf[1], random.random())]
                 def pickpath():
                     return os.path.join(pick(initdirscdf), random.choice(words))
                 def makeline(minimum=0):
                     total = max(minimum, pick(linelengths))
                     c, l = 0, []
                     while c < total:
                         w = random.choice(words)
                         c += len(w) + 1
                         l.append(w)
                     return ' '.join(l)
                 wlock = repo.wlock()
                 lock = repo.lock()
                 nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))
                 progress = ui.progress
                 _synthesizing = _('synthesizing')
                 _files = _('initial files')
                 _changesets = _('changesets')
                 # Synthesize a single initial revision adding files to the repo according
                 # to the modeled directory structure.
                 initcount = int(opts['initfiles'])
                 if initcount and initdirs:
                     pctx = repo[None].parents()[0]
                     files = {}
                     for i in xrange(0, initcount):
                         ui.progress(_synthesizing, i, unit=_files, total=initcount)
                         path = pickpath()
                         while path in pctx.dirs():
                             path = pickpath()
                         data = '%s contents\n' % path
                         files[path] = context.memfilectx(repo, path, data)
                     def filectxfn(repo, memctx, path):
                         return files[path]
                     ui.progress(_synthesizing, None)
                     message = 'synthesized wide repo with %d files' % (len(files),)
                     mc = context.memctx(repo, [pctx.node(), nullid], message,
                                         files.iterkeys(), filectxfn, ui.username(),
                                         '%d %d' % util.makedate())
                     initnode = mc.commit()
                     hexfn = ui.debugflag and hex or short
                     ui.status(_('added commit %s with %d files\n')
                               % (hexfn(initnode), len(files)))
                 # Synthesize incremental revisions to the repository, adding repo depth.
                 count = int(opts['count'])
                 heads = set(map(repo.changelog.rev, repo.heads()))
                 for i in xrange(count):
                     progress(_synthesizing, i, unit=_changesets, total=count)
                     node = repo.changelog.node
                     revs = len(repo)
                     def pickhead(heads, distance):
                         if heads:
                             lheads = sorted(heads)
                             rev = revs - min(pick(distance), revs)
                             if rev < lheads[-1]:
                                 rev = lheads[bisect.bisect_left(lheads, rev)]
                             else:
                                 rev = lheads[-1]
                             return rev, node(rev)
                         return nullrev, nullid
                     r1 = revs - min(pick(p1distance), revs)
                     p1 = node(r1)
                     # the number of heads will grow without bound if we use a pure
                     # model, so artificially constrain their proliferation
                     toomanyheads = len(heads) > random.randint(1, 20)
                     if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                         r2, p2 = pickhead(heads.difference([r1]), p2distance)
                     else:
                         r2, p2 = nullrev, nullid
                     pl = [p1, p2]
                     pctx = repo[r1]
                     mf = pctx.manifest()
                     mfk = mf.keys()
                     changes = {}
                     if mfk:
                         for __ in xrange(pick(fileschanged)):
                             for __ in xrange(10):
                                 fctx = pctx.filectx(random.choice(mfk))
                                 path = fctx.path()
                                 if not (path in nevertouch or fctx.isbinary() or
                                         'l' in fctx.flags()):
                                     break
                             lines = fctx.data().splitlines()
                             add, remove = pick(lineschanged)
                             for __ in xrange(remove):
                                 if not lines:
                                     break
                                 del lines[random.randrange(0, len(lines))]
                             for __ in xrange(add):
                                 lines.insert(random.randint(0, len(lines)), makeline())
                             path = fctx.path()
                             changes[path] = context.memfilectx(repo, path,
                                                                '\n'.join(lines) + '\n')
                         for __ in xrange(pick(filesremoved)):
                             path = random.choice(mfk)
                             for __ in xrange(10):
                                 path = random.choice(mfk)
                                 if path not in changes:
                                     changes[path] = None
                                     break
                     if filesadded:
                         dirs = list(pctx.dirs())
                         dirs.append('')
                     for __ in xrange(pick(filesadded)):
                         path = [random.choice(dirs)]
                         if pick(dirsadded):
                             path.append(random.choice(words))
                         path.append(random.choice(words))
                         path = '/'.join(filter(None, path))
                         data = '\n'.join(makeline()
                                          for __ in xrange(pick(linesinfilesadded))) + '\n'
                         changes[path] = context.memfilectx(repo, path, data)
                     def filectxfn(repo, memctx, path):
                         return changes[path]
                     if not changes:
                         continue
                     if revs:
                         date = repo['tip'].date()[0] + pick(interarrival)
                     else:
                         date = time.time() - (86400 * count)
                     user = random.choice(words) + '@' + random.choice(words)
                     mc = context.memctx(repo, pl, makeline(minimum=2),
                                         sorted(changes.iterkeys()),
                                         filectxfn, user, '%d %d' % (date, pick(tzoffset)))
                     newnode = mc.commit()
                     heads.add(repo.changelog.rev(newnode))
                     heads.discard(r1)
                     heads.discard(r2)
                 lock.release()
                 wlock.release()
             def renamedirs(dirs, words):
                 '''Replace the directory names in the per-dir file count dict.

                 dirs maps directory paths to file counts. words is an iterable of
                 replacement names; it is consumed in order and reused cyclically when
                 there are more path components than words. Returns a list of
                 [renamed path, count] pairs, one per entry of dirs.
                 '''
                 wordgen = itertools.cycle(words)
                 # The root directory always maps to itself.
                 replacements = {'': ''}

                 def rename(dirpath):
                     '''Recursively rename the directory and all path prefixes.

                     The mapping from path to renamed path is stored for all path
                     prefixes, so each prefix is renamed only once and directories
                     sharing a prefix keep sharing it after renaming, regardless of
                     iteration order through the model.
                     '''
                     if dirpath in replacements:
                         return replacements[dirpath]
                     head, _tail = os.path.split(dirpath)
                     if head:
                         head = rename(head)
                     # next() works on both Python 2.6+ and Python 3 iterators.
                     renamed = os.path.join(head, next(wordgen))
                     replacements[dirpath] = renamed
                     return renamed

                 # Leading separators are dropped so that paths are relative to the root.
                 return [[rename(dirpath.lstrip(os.sep)), count]
                         for dirpath, count in dirs.items()]