upstream/mercurial-mirror Commit - r35399:2123e762

1

# synthrepo.py - repo synthesis

1

# synthrepo.py - repo synthesis

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

'''synthesize structurally interesting change history

8

'''synthesize structurally interesting change history

9

10

This extension is useful for creating a repository with properties

10

This extension is useful for creating a repository with properties

11

that are statistically similar to an existing repository. During

11

that are statistically similar to an existing repository. During

12

analysis, a simple probability table is constructed from the history

12

analysis, a simple probability table is constructed from the history

13

of an existing repository. During synthesis, these properties are

13

of an existing repository. During synthesis, these properties are

14

reconstructed.

14

reconstructed.

15

16

Properties that are analyzed and synthesized include the following:

16

Properties that are analyzed and synthesized include the following:

17

18

- Lines added or removed when an existing file is modified

18

- Lines added or removed when an existing file is modified

19

- Number and sizes of files added

19

- Number and sizes of files added

20

- Number of files removed

20

- Number of files removed

21

- Line lengths

21

- Line lengths

22

- Topological distance to parent changeset(s)

22

- Topological distance to parent changeset(s)

23

- Probability of a commit being a merge

23

- Probability of a commit being a merge

24

- Probability of a newly added file being added to a new directory

24

- Probability of a newly added file being added to a new directory

25

- Interarrival time, and time zone, of commits

25

- Interarrival time, and time zone, of commits

26

- Number of files in each directory

26

- Number of files in each directory

27

28

A few obvious properties that are not currently handled realistically:

28

A few obvious properties that are not currently handled realistically:

29

30

- Merges are treated as regular commits with two parents, which is not

30

- Merges are treated as regular commits with two parents, which is not

31

realistic

31

realistic

32

- Modifications are not treated as operations on hunks of lines, but

32

- Modifications are not treated as operations on hunks of lines, but

33

as insertions and deletions of randomly chosen single lines

33

as insertions and deletions of randomly chosen single lines

34

- Committer ID (always random)

34

- Committer ID (always random)

35

- Executability of files

35

- Executability of files

36

- Symlinks and binary files are ignored

36

- Symlinks and binary files are ignored

37

'''

37

'''

38

39

from __future__ import absolute_import

39

from __future__ import absolute_import

40

import bisect

40

import bisect

41

import collections

41

import collections

42

import itertools

42

import itertools

43

import json

43

import json

44

import os

44

import os

45

import random

45

import random

46

import sys

46

import sys

47

import time

47

import time

48

49

from mercurial.i18n import _

49

from mercurial.i18n import _

50

from mercurial.node import (

50

from mercurial.node import (

51

nullid,

51

nullid,

52

nullrev,

52

nullrev,

53

short,

53

short,

54

)

54

)

55

from mercurial import (

55

from mercurial import (

56

context,

56

context,

57

error,

57

error,

58

hg,

58

hg,

59

patch,

59

patch,

60

registrar,

60

registrar,

61

scmutil,

61

scmutil,

62

util,

62

util,

63

)

63

)

64

65

# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
# be specifying the version(s) of Mercurial they are tested with, or
# leave the attribute unspecified.
testedwith = 'ships-with-hg-core'

# Table filled in by the @command decorator applied to the commands below.
cmdtable = {}
command = registrar.command(cmdtable)

# Six-character prefixes of git-diff header lines ('new file', 'rename ...',
# 'copy from', 'copy to') that parsegitdiff() treats as marking an added file.
newfile = {'new fi', 'rename', 'copy f', 'copy t'}

75

76

def zerodict():
    '''Return a new dictionary whose missing keys read as the integer 0.

    Used for all the frequency tables in this module, so counters can be
    bumped with ``table[key] += 1`` without initialising the key first.
    '''
    # int() returns 0, so the builtin type is the idiomatic zero factory.
    return collections.defaultdict(int)

78

79

def roundto(x, k):
    '''Bucket x: round it to a multiple of k unless it is close to zero.

    Values whose magnitude exceeds ``2 * k`` are rounded to the nearest
    multiple of k; smaller values are only rounded to the nearest integer,
    so small quantities keep their exact value in the model.

    The magnitude (not the signed value) is compared against ``2 * k`` so
    that large negative inputs, such as the time delta between commits
    whose dates are out of order, are bucketed like positive ones instead
    of each getting a bucket of its own.

    Always returns an int.
    '''
    if abs(x) > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))

83

84

def parsegitdiff(lines):
    '''Yield per-file statistics for the lines of a git-style diff.

    For every file section introduced by a ``diff -`` header line, a tuple
    ``(filename, mar, lineadd, lineremove, binary)`` is produced, where:

    - filename is the first group matched by ``patch.gitre`` on the header
    - mar is 'm' (modified), 'a' (header line prefix found in ``newfile``)
      or 'r' (a ``delete`` header line was seen)
    - lineadd maps the bucketed length of each added line to a count
    - lineremove is the number of removed lines
    - binary is True if a ``GIT bi`` header line was seen
    '''
    filename = mar = None
    lineadd, lineremove, binary = zerodict(), 0, False
    for line in lines:
        start = line[:6]
        if start == 'diff -':
            # A new file section begins: flush the previous one, if any.
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar = 'm'
            lineadd = zerodict()
            lineremove = 0
            binary = False
            filename = patch.gitre.match(line).group(1)
            continue
        if start in newfile:
            mar = 'a'
            continue
        if start == 'GIT bi':
            binary = True
            continue
        if start == 'delete':
            mar = 'r'
            continue
        if not start:
            continue
        # Hunk content: count -/+ lines, skipping the ---/+++ file headers.
        marker = start[0]
        if marker == '-':
            if not line.startswith('--- '):
                lineremove += 1
        elif marker == '+':
            if not line.startswith('+++ '):
                lineadd[roundto(len(line) - 1, 5)] += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary

108

109

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.

    If no output file is given, the model is written to a file named
    after the analyzed directory with a ``.json`` suffix. Use ``-`` as
    the output file to write the model to standard output.
    '''
    # The command is registered with optionalrepo=True, so repo may be
    # None. NOTE(review): falling back to the current directory for the
    # directory walk is an assumption -- confirm this is the intended root.
    if repo:
        root = repo.root
    else:
        root = os.getcwd()
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        # root always ends with a separator at this point; strip it before
        # taking the basename, otherwise the name would be just '.json'.
        output = os.path.basename(root.rstrip(os.path.sep)) + '.json'

    # Open the output before analyzing so an unwritable path fails fast.
    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    try:
        # Always obtain file counts of each directory in the given root
        # directory.
        def onerror(e):
            ui.warn(_('error walking directory structure: %s\n') % e)

        dirs = {}
        rootprefixlen = len(root)
        for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
            dirs[dirpath[rootprefixlen:]] = len(filenames)
            # Removing the entry in place stops os.walk from descending
            # into the .hg directory.
            if '.hg' in dirnames:
                dirnames.remove('.hg')

        lineschanged = zerodict()
        children = zerodict()
        p1distance = zerodict()
        p2distance = zerodict()
        linesinfilesadded = zerodict()
        fileschanged = zerodict()
        filesadded = zerodict()
        filesremoved = zerodict()
        linelengths = zerodict()
        interarrival = zerodict()
        parents = zerodict()
        dirsadded = zerodict()
        tzoffset = zerodict()

        # If a mercurial repo is available, also model the commit history.
        revcount = 0
        if repo:
            revs = scmutil.revrange(repo, revs)
            revs.sort()
            revcount = len(revs)

            progress = ui.progress
            _analyzing = _('analyzing')
            _changesets = _('changesets')
            _total = revcount

            for i, rev in enumerate(revs):
                progress(_analyzing, i, unit=_changesets, total=_total)
                ctx = repo[rev]
                pl = ctx.parents()
                pctx = pl[0]
                prev = pctx.rev()
                children[prev] += 1
                p1distance[rev - prev] += 1
                parents[len(pl)] += 1
                tzoffset[ctx.date()[1]] += 1
                if len(pl) > 1:
                    p2distance[rev - pl[1].rev()] += 1
                # Interarrival time is measured against the previous
                # revision number, which is not necessarily the parent.
                if prev == rev - 1:
                    lastctx = pctx
                else:
                    lastctx = repo[rev - 1]
                if lastctx.rev() != nullrev:
                    timedelta = ctx.date()[0] - lastctx.date()[0]
                    interarrival[roundto(timedelta, 300)] += 1
                # Flatten the diff chunks into one list of lines without
                # the quadratic cost of sum(list_of_lists, []).
                diff = [line for chunk in ctx.diff(pctx, git=True)
                        for line in chunk.splitlines()]
                fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
                filestats = parsegitdiff(diff)
                for filename, mar, lineadd, lineremove, isbin in filestats:
                    if isbin:
                        continue
                    added = sum(lineadd.values(), 0)
                    if mar == 'm':
                        # Only files with both additions and removals
                        # are counted as changed.
                        if added and lineremove:
                            lineschanged[roundto(added, 5),
                                         roundto(lineremove, 5)] += 1
                            filechanges += 1
                    elif mar == 'a':
                        fileadds += 1
                        if '/' in filename:
                            filedir = filename.rsplit('/', 1)[0]
                            if filedir not in pctx.dirs():
                                diradds += 1
                        linesinfilesadded[roundto(added, 5)] += 1
                    elif mar == 'r':
                        fileremoves += 1
                    for length, count in lineadd.items():
                        linelengths[length] += count
                fileschanged[filechanges] += 1
                filesadded[fileadds] += 1
                dirsadded[diradds] += 1
                filesremoved[fileremoves] += 1
            progress(_analyzing, None)

        # Turn "children per revision" into "revisions per child count".
        invchildren = zerodict()
        for rev, count in children.items():
            invchildren[count] += 1

        if output != '-':
            ui.status(_('writing output to %s\n') % output)

        def pronk(d):
            # (key, count) pairs, most frequent first.
            return sorted(d.items(), key=lambda x: x[1], reverse=True)

        json.dump({'revs': revcount,
                   'initdirs': pronk(dirs),
                   'lineschanged': pronk(lineschanged),
                   'children': pronk(invchildren),
                   'fileschanged': pronk(fileschanged),
                   'filesadded': pronk(filesadded),
                   'linesinfilesadded': pronk(linesinfilesadded),
                   'dirsadded': pronk(dirsadded),
                   'filesremoved': pronk(filesremoved),
                   'linelengths': pronk(linelengths),
                   'parents': pronk(parents),
                   'p1distance': pronk(p1distance),
                   'p2distance': pronk(p2distance),
                   'interarrival': pronk(interarrival),
                   'tzoffset': pronk(tzoffset),
                   },
                  fp)
    finally:
        # Never close the process-wide stdout; only files opened here.
        if fp is sys.stdout:
            fp.flush()
        else:
            fp.close()

255

256

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number of files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    # Only nullid, nullrev and short are imported at module level, and the
    # builtin hex() cannot format a binary node, so fetch the node helper.
    from mercurial.node import hex as nodehex

    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # Prefer the OS error text; errors that wrap another error in a
        # 'reason' attribute are unwrapped first. Fall back to str().
        reason = getattr(err, 'reason', err)
        msg = getattr(reason, 'strerror', None) or str(reason)
        raise error.Abort('%s: %s' % (descpath, msg))
    try:
        desc = json.load(fp)
    finally:
        fp.close()

    def cdf(l):
        '''Turn (value, count) pairs into (values, cumulative probabilities),
        ordered from most to least frequent.'''
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    try:
        words = fp.read().splitlines()
    finally:
        fp.close()
    if not words:
        # Every random.choice(words) below would fail on an empty list.
        raise error.Abort('%s: no words found' % dictfile)

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(dist):
        '''Draw a random value from a (values, cumulative probs) pair.'''
        return dist[0][bisect.bisect_left(dist[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        '''Build a line of random words at least as long as a length drawn
        from the model (or ``minimum``, whichever is larger).'''
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)

    nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

    progress = ui.progress
    _synthesizing = _('synthesizing')
    _files = _('initial files')
    _changesets = _('changesets')

    wlock = lock = None
    try:
        wlock = repo.wlock()
        lock = repo.lock()

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts['initfiles'])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            for i in xrange(0, initcount):
                progress(_synthesizing, i, unit=_files, total=initcount)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                files[path] = '%s contents\n' % path
                # Record every new ancestor directory of the file.
                parent = os.path.dirname(path)
                while parent and parent not in dirs:
                    dirs.add(parent)
                    parent = os.path.dirname(parent)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, path, files[path])

            progress(_synthesizing, None)
            message = 'synthesized wide repo with %d files' % (len(files),)
            mc = context.memctx(repo, [pctx.node(), nullid], message,
                                sorted(files), filectxfn, ui.username(),
                                '%d %d' % util.makedate())
            initnode = mc.commit()
            if ui.debugflag:
                hexfn = nodehex
            else:
                hexfn = short
            ui.status(_('added commit %s with %d files\n')
                      % (hexfn(initnode), len(files)))

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts['count'])
        heads = set(map(repo.changelog.rev, repo.heads()))
        for i in xrange(count):
            progress(_synthesizing, i, unit=_changesets, total=count)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                '''Pick a head near the revision found by drawing a
                distance from the model; (nullrev, nullid) if no heads.'''
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}    # path -> new file content
            removed = set() # paths to delete in this changeset
            if mfk:
                for __ in xrange(pick(fileschanged)):
                    # Try a few times to find a regular text file.
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (path in nevertouch or fctx.isbinary() or
                                'l' in fctx.flags()):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        pos = random.randint(0, len(lines))
                        lines.insert(pos, makeline())
                    changes[fctx.path()] = '\n'.join(lines) + '\n'
                for __ in xrange(pick(filesremoved)):
                    # Try a few times to find a file not already touched;
                    # record it so the removal is part of the commit.
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes and path not in removed:
                            removed.add(path)
                            break
            dirs = list(pctx.dirs())
            dirs.insert(0, '')
            for __ in xrange(pick(filesadded)):
                pathstr = ''
                # Never reuse an existing directory name as a file name.
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = '/'.join(filter(None, path))
                data = '\n'.join(
                    makeline()
                    for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[pathstr] = data
                # A path that receives new content is no longer a removal.
                removed.discard(pathstr)

            def filectxfn(repo, memctx, path):
                # Paths without content are the removals; memctx is
                # expected to treat a None result as "file removed".
                if path not in changes:
                    return None
                return context.memfilectx(repo, path, changes[path])

            if not changes and not removed:
                continue
            if revs:
                date = repo['tip'].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7fffffff, max(0, date))
            user = random.choice(words) + '@' + random.choice(words)
            mc = context.memctx(repo, pl, makeline(minimum=2),
                                sorted(removed.union(changes)),
                                filectxfn, user,
                                '%d %d' % (date, pick(tzoffset)))
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
        progress(_synthesizing, None)
    finally:
        # Release in reverse acquisition order, even when a commit fails.
        if lock is not None:
            lock.release()
        if wlock is not None:
            wlock.release()

491

492

def renamedirs(dirs, words):

492

def renamedirs(dirs, words):

493

'''Randomly rename the directory names in the per-dir file count dict.'''

493

'''Randomly rename the directory names in the per-dir file count dict.'''

494

wordgen = itertools.cycle(words)

494

wordgen = itertools.cycle(words)

495

replacements = {'': ''}

495

replacements = {'': ''}

496

def rename(dirpath):

496

def rename(dirpath):

497

'''Recursively rename the directory and all path prefixes.

497

'''Recursively rename the directory and all path prefixes.

498

499

The mapping from path to renamed path is stored for all path prefixes

499

The mapping from path to renamed path is stored for all path prefixes

500

as in dynamic programming, ensuring linear runtime and consistent

500

as in dynamic programming, ensuring linear runtime and consistent

501

renaming regardless of iteration order through the model.

501

renaming regardless of iteration order through the model.

502

'''

502

'''

503

if dirpath in replacements:

503

if dirpath in replacements:

504

return replacements[dirpath]

504

return replacements[dirpath]

505

head, _ = os.path.split(dirpath)

505

head, _ = os.path.split(dirpath)

506

if head:

506

if head:

507

head = rename(head)

507

head = rename(head)

508

else:

508

else:

509

head = ''

509

head = ''

510

renamed = os.path.join(head, next(wordgen))

510

renamed = os.path.join(head, next(wordgen))

511

replacements[dirpath] = renamed

511

replacements[dirpath] = renamed

512

return renamed

512

return renamed

513

result = []

513

result = []

514

for dirpath, count in dirs.iteritems():

514

for dirpath, count in dirs.iteritems():

515

result.append([rename(dirpath.lstrip(os.sep)), count])

515

result.append([rename(dirpath.lstrip(os.sep)), count])

516

return result

516

return result

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # synthrepo.py - repo synthesis
             #
             # Copyright 2012 Facebook
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             '''synthesize structurally interesting change history
             This extension is useful for creating a repository with properties
             that are statistically similar to an existing repository. During
             analysis, a simple probability table is constructed from the history
             of an existing repository.  During synthesis, these properties are
             reconstructed.
             Properties that are analyzed and synthesized include the following:
             - Lines added or removed when an existing file is modified
             - Number and sizes of files added
             - Number of files removed
             - Line lengths
             - Topological distance to parent changeset(s)
             - Probability of a commit being a merge
             - Probability of a newly added file being added to a new directory
             - Interarrival time, and time zone, of commits
             - Number of files in each directory
             A few obvious properties that are not currently handled realistically:
             - Merges are treated as regular commits with two parents, which is not
               realistic
             - Modifications are not treated as operations on hunks of lines, but
               as insertions and deletions of randomly chosen single lines
             - Committer ID (always random)
             - Executability of files
             - Symlinks and binary files are ignored
             '''
             from __future__ import absolute_import
             import bisect
             import collections
             import itertools
             import json
             import os
             import random
             import sys
             import time
             from mercurial.i18n import _
             from mercurial.node import (
                 nullid,
                 nullrev,
                 short,
             )
             from mercurial import (
                 context,
                 error,
                 hg,
                 patch,
                 registrar,
                 scmutil,
                 util,
             )
             # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
             # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
             # be specifying the version(s) of Mercurial they are tested with, or
             # leave the attribute unspecified.
             testedwith = 'ships-with-hg-core'
             # Command table handed to registrar.command(); the resulting `command`
             # object is used as a decorator on the command functions in this file.
             cmdtable = {}
             command = registrar.command(cmdtable)
             # parsegitdiff() compares the first six characters of every diff line
             # against this set; a match marks the current file as added ('a').
             # The entries look like truncated git extended-header lines such as
             # "new file mode", "rename from/to", "copy from" and "copy to".
             newfile = {'new fi', 'rename', 'copy f', 'copy t'}
             def zerodict():
                 '''Return a new, empty mapping in which absent keys read as 0.

                 Used for the frequency tables built during analysis, so counters
                 can be bumped with ``d[key] += 1`` without initialising them.
                 '''
                 return collections.defaultdict(int)
             def roundto(x, k):
                 '''Return x as an int, bucketed to a multiple of k when x is large.

                 Values greater than 2 * k are rounded to the nearest multiple of k;
                 anything else is simply rounded to the nearest integer.
                 '''
                 bucketed = round(x / float(k)) * k if x > k * 2 else round(x)
                 return int(bucketed)
             def parsegitdiff(lines):
                 '''Summarise a git-style diff, yielding one tuple per file section.

                 Each yielded tuple is (filename, mar, lineadd, lineremove, binary):

                 - filename: group 1 of patch.gitre matched against the "diff -" line
                 - mar: 'm' by default, 'a' when a line prefix is in ``newfile``,
                   'r' when a line starts with "delete"
                 - lineadd: zerodict mapping bucketed added-line length to a count
                 - lineremove: number of removed lines
                 - binary: True once a line starting with "GIT bi" has been seen
                 '''
                 filename = mar = None
                 lineadd, lineremove, binary = zerodict(), 0, False
                 for line in lines:
                     prefix = line[:6]
                     if prefix == 'diff -':
                         # A new file section starts: emit the finished one, if any,
                         # then reset the per-file state.
                         if filename:
                             yield filename, mar, lineadd, lineremove, binary
                         mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
                         filename = patch.gitre.match(line).group(1)
                     elif prefix in newfile:
                         mar = 'a'
                     elif prefix == 'GIT bi':
                         binary = True
                     elif prefix == 'delete':
                         mar = 'r'
                     elif line.startswith('-'):
                         # "--- " is the old-file header, not a removed line.
                         if not line.startswith('--- '):
                             lineremove += 1
                     elif line.startswith('+'):
                         # "+++ " is the new-file header, not an added line.
                         if not line.startswith('+++ '):
                             # len(line) - 1 drops the leading '+' marker.
                             lineadd[roundto(len(line) - 1, 5)] += 1
                 # Emit the last file section of the diff.
                 if filename:
                     yield filename, mar, lineadd, lineremove, binary
             @command('analyze',
                      [('o', 'output', '', _('write output to given file'), _('FILE')),
                       ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
                      _('hg analyze'), optionalrepo=True)
             def analyze(ui, repo, *revs, **opts):
                 '''create a simple model of a repository to use for later synthesis
                 This command examines every changeset in the given range (or all
                 of history if none are specified) and creates a simple statistical
                 model of the history of the repository. It also measures the directory
                 structure of the repository as checked out.
                 The model is written out to a JSON file, and can be used by
                 :hg:`synthesize` to create or augment a repository with synthetic
                 commits that have a structure that is statistically similar to the
                 analyzed repository.
                 '''
                 # NOTE(review): repo.root is read unconditionally although the command
                 # is registered with optionalrepo=True and the history pass below is
                 # guarded by "if repo:" -- confirm repo can never be None here.
                 root = repo.root
                 # Make root end with a separator so that slicing a walked path by
                 # len(root) below leaves a path relative to the root.
                 if not root.endswith(os.path.sep):
                     root += os.path.sep
                 # Revisions may come both positionally and through --rev.
                 revs = list(revs)
                 revs.extend(opts['rev'])
                 if not revs:
                     revs = [':']
                 output = opts['output']
                 if not output:
                     # root always ends with a separator at this point and
                     # os.path.basename() of such a path is '', so strip the separator
                     # first; otherwise the default file would be named just '.json'.
                     output = os.path.basename(root.rstrip(os.path.sep)) + '.json'
                 if output == '-':
                     fp = sys.stdout
                 else:
                     fp = open(output, 'w')
                 # Always obtain file counts of each directory in the given root directory.
                 def onerror(e):
                     ui.warn(_('error walking directory structure: %s\n') % e)
                 dirs = {}
                 rootprefixlen = len(root)
                 for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
                     dirpathfromroot = dirpath[rootprefixlen:]
                     dirs[dirpathfromroot] = len(filenames)
                     # Removing '.hg' from dirnames in place keeps os.walk() from
                     # descending into the repository metadata.
                     if '.hg' in dirnames:
                         dirnames.remove('.hg')
                 # Frequency tables; every one maps an observed value to a count.
                 lineschanged = zerodict()
                 children = zerodict()
                 p1distance = zerodict()
                 p2distance = zerodict()
                 linesinfilesadded = zerodict()
                 fileschanged = zerodict()
                 filesadded = zerodict()
                 filesremoved = zerodict()
                 linelengths = zerodict()
                 interarrival = zerodict()
                 parents = zerodict()
                 dirsadded = zerodict()
                 tzoffset = zerodict()
                 # If a mercurial repo is available, also model the commit history.
                 if repo:
                     revs = scmutil.revrange(repo, revs)
                     revs.sort()
                     progress = ui.progress
                     _analyzing = _('analyzing')
                     _changesets = _('changesets')
                     _total = len(revs)
                     for i, rev in enumerate(revs):
                         progress(_analyzing, i, unit=_changesets, total=_total)
                         ctx = repo[rev]
                         pl = ctx.parents()
                         pctx = pl[0]
                         prev = pctx.rev()
                         children[prev] += 1
                         # Parent distances are differences of revision numbers.
                         p1distance[rev - prev] += 1
                         parents[len(pl)] += 1
                         tzoffset[ctx.date()[1]] += 1
                         if len(pl) > 1:
                             p2distance[rev - pl[1].rev()] += 1
                         # Interarrival time is measured against the changeset with the
                         # preceding revision number, which need not be the first parent.
                         if prev == rev - 1:
                             lastctx = pctx
                         else:
                             lastctx = repo[rev - 1]
                         if lastctx.rev() != nullrev:
                             timedelta = ctx.date()[0] - lastctx.date()[0]
                             interarrival[roundto(timedelta, 300)] += 1
                         diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
                         fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
                         for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                             if isbin:
                                 continue
                             added = sum(lineadd.itervalues(), 0)
                             if mar == 'm':
                                 # Only modifications that both add and remove lines
                                 # are recorded.
                                 if added and lineremove:
                                     lineschanged[roundto(added, 5),
                                                  roundto(lineremove, 5)] += 1
                                     filechanges += 1
                             elif mar == 'a':
                                 fileadds += 1
                                 if '/' in filename:
                                     filedir = filename.rsplit('/', 1)[0]
                                     # The file's directory did not exist in the parent.
                                     if filedir not in pctx.dirs():
                                         diradds += 1
                                 linesinfilesadded[roundto(added, 5)] += 1
                             elif mar == 'r':
                                 fileremoves += 1
                             for length, count in lineadd.iteritems():
                                 linelengths[length] += count
                         fileschanged[filechanges] += 1
                         filesadded[fileadds] += 1
                         dirsadded[diradds] += 1
                         filesremoved[fileremoves] += 1
                 # Turn "parent rev -> number of children" into a histogram of
                 # "number of children -> how many revisions have that many".
                 invchildren = zerodict()
                 for rev, count in children.iteritems():
                     invchildren[count] += 1
                 if output != '-':
                     ui.status(_('writing output to %s\n') % output)
                 def pronk(d):
                     # (value, count) pairs, most frequent first.
                     return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
                 try:
                     json.dump({'revs': len(revs),
                                'initdirs': pronk(dirs),
                                'lineschanged': pronk(lineschanged),
                                'children': pronk(invchildren),
                                'fileschanged': pronk(fileschanged),
                                'filesadded': pronk(filesadded),
                                'linesinfilesadded': pronk(linesinfilesadded),
                                'dirsadded': pronk(dirsadded),
                                'filesremoved': pronk(filesremoved),
                                'linelengths': pronk(linelengths),
                                'parents': pronk(parents),
                                'p1distance': pronk(p1distance),
                                'p2distance': pronk(p2distance),
                                'interarrival': pronk(interarrival),
                                'tzoffset': pronk(tzoffset),
                                },
                               fp)
                 finally:
                     # Only close a file this function opened; sys.stdout belongs to
                     # the process and must stay usable after the command returns.
                     if fp is sys.stdout:
                         fp.flush()
                     else:
                         fp.close()
             @command('synthesize',
                      [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
                       ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
                       ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
                      _('hg synthesize [OPTION].. DESCFILE'))
             def synthesize(ui, repo, descpath, **opts):
                 '''synthesize commits based on a model of an existing repository
                 The model must have been generated by :hg:`analyze`. Commits will
                 be generated randomly according to the probabilities described in
                 the model. If --initfiles is set, the repository will be seeded with
                 the given number files following the modeled repository's directory
                 structure.
                 When synthesizing new content, commit descriptions, and user
                 names, words will be chosen randomly from a dictionary that is
                 presumed to contain one word per line. Use --dict to specify the
                 path to an alternate dictionary to use.
                 '''
                 try:
                     fp = hg.openpath(ui, descpath)
                 except Exception as err:
                     raise error.Abort('%s: %s' % (descpath, err[0].strerror))
                 desc = json.load(fp)
                 fp.close()
                 def cdf(l):
                     if not l:
                         return [], []
                     vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
                     t = float(sum(probs, 0))
                     s, cdfs = 0, []
                     for v in probs:
                         s += v
                         cdfs.append(s / t)
                     return vals, cdfs
                 lineschanged = cdf(desc['lineschanged'])
                 fileschanged = cdf(desc['fileschanged'])
                 filesadded = cdf(desc['filesadded'])
                 dirsadded = cdf(desc['dirsadded'])
                 filesremoved = cdf(desc['filesremoved'])
                 linelengths = cdf(desc['linelengths'])
                 parents = cdf(desc['parents'])
                 p1distance = cdf(desc['p1distance'])
                 p2distance = cdf(desc['p2distance'])
                 interarrival = cdf(desc['interarrival'])
                 linesinfilesadded = cdf(desc['linesinfilesadded'])
                 tzoffset = cdf(desc['tzoffset'])
                 dictfile = opts.get('dict') or '/usr/share/dict/words'
                 try:
                     fp = open(dictfile, 'rU')
                 except IOError as err:
                     raise error.Abort('%s: %s' % (dictfile, err.strerror))
                 words = fp.read().splitlines()
                 fp.close()
                 initdirs = {}
                 if desc['initdirs']:
                     for k, v in desc['initdirs']:
                         initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
                     initdirs = renamedirs(initdirs, words)
                 initdirscdf = cdf(initdirs)
                 def pick(cdf):
                     return cdf[0][bisect.bisect_left(cdf[1], random.random())]
                 def pickpath():
                     return os.path.join(pick(initdirscdf), random.choice(words))
                 def makeline(minimum=0):
                     total = max(minimum, pick(linelengths))
                     c, l = 0, []
                     while c < total:
                         w = random.choice(words)
                         c += len(w) + 1
                         l.append(w)
                     return ' '.join(l)
                 wlock = repo.wlock()
                 lock = repo.lock()
                 nevertouch = {'.hgsub', '.hgignore', '.hgtags'}
                 progress = ui.progress
                 _synthesizing = _('synthesizing')
                 _files = _('initial files')
                 _changesets = _('changesets')
                 # Synthesize a single initial revision adding files to the repo according
                 # to the modeled directory structure.
                 initcount = int(opts['initfiles'])
                 if initcount and initdirs:
                     pctx = repo[None].parents()[0]
                     dirs = set(pctx.dirs())
                     files = {}
                     def validpath(path):
                         # Don't pick filenames which are already directory names.
                         if path in dirs:
                             return False
                         # Don't pick directories which were used as file names.
                         while path:
                             if path in files:
                                 return False
                             path = os.path.dirname(path)
                         return True
                     for i in xrange(0, initcount):
                         ui.progress(_synthesizing, i, unit=_files, total=initcount)
                         path = pickpath()
                         while not validpath(path):
                             path = pickpath()
                         data = '%s contents\n' % path
-                        files[path] = context.memfilectx(repo, path, data)
+                        files[path] = data
                         dir = os.path.dirname(path)
                         while dir and dir not in dirs:
                             dirs.add(dir)
                             dir = os.path.dirname(dir)
                     def filectxfn(repo, memctx, path):
-                        return files[path]
+                        return context.memfilectx(repo, path, files[path])
                     ui.progress(_synthesizing, None)
                     message = 'synthesized wide repo with %d files' % (len(files),)
                     mc = context.memctx(repo, [pctx.node(), nullid], message,
                                         files.iterkeys(), filectxfn, ui.username(),
                                         '%d %d' % util.makedate())
                     initnode = mc.commit()
                     if ui.debugflag:
                         hexfn = hex
                     else:
                         hexfn = short
                     ui.status(_('added commit %s with %d files\n')
                               % (hexfn(initnode), len(files)))
                 # Synthesize incremental revisions to the repository, adding repo depth.
                 count = int(opts['count'])
                 heads = set(map(repo.changelog.rev, repo.heads()))
                 for i in xrange(count):
                     progress(_synthesizing, i, unit=_changesets, total=count)
                     node = repo.changelog.node
                     revs = len(repo)
                     def pickhead(heads, distance):
                         if heads:
                             lheads = sorted(heads)
                             rev = revs - min(pick(distance), revs)
                             if rev < lheads[-1]:
                                 rev = lheads[bisect.bisect_left(lheads, rev)]
                             else:
                                 rev = lheads[-1]
                             return rev, node(rev)
                         return nullrev, nullid
                     r1 = revs - min(pick(p1distance), revs)
                     p1 = node(r1)
                     # the number of heads will grow without bound if we use a pure
                     # model, so artificially constrain their proliferation
                     toomanyheads = len(heads) > random.randint(1, 20)
                     if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                         r2, p2 = pickhead(heads.difference([r1]), p2distance)
                     else:
                         r2, p2 = nullrev, nullid
                     pl = [p1, p2]
                     pctx = repo[r1]
                     mf = pctx.manifest()
                     mfk = mf.keys()
                     changes = {}
                     if mfk:
                         for __ in xrange(pick(fileschanged)):
                             for __ in xrange(10):
                                 fctx = pctx.filectx(random.choice(mfk))
                                 path = fctx.path()
                                 if not (path in nevertouch or fctx.isbinary() or
                                         'l' in fctx.flags()):
                                     break
                             lines = fctx.data().splitlines()
                             add, remove = pick(lineschanged)
                             for __ in xrange(remove):
                                 if not lines:
                                     break
                                 del lines[random.randrange(0, len(lines))]
                             for __ in xrange(add):
                                 lines.insert(random.randint(0, len(lines)), makeline())
                             path = fctx.path()
-                            changes[path] = context.memfilectx(repo, path,
+                            changes[path] = '\n'.join(lines) + '\n'
-                                                               '\n'.join(lines) + '\n')
                         for __ in xrange(pick(filesremoved)):
                             path = random.choice(mfk)
                             for __ in xrange(10):
                                 path = random.choice(mfk)
                                 if path not in changes:
-                                    changes[path] = None
                                     break
                     if filesadded:
                         dirs = list(pctx.dirs())
                         dirs.insert(0, '')
                     for __ in xrange(pick(filesadded)):
                         pathstr = ''
                         while pathstr in dirs:
                             path = [random.choice(dirs)]
                             if pick(dirsadded):
                                 path.append(random.choice(words))
                             path.append(random.choice(words))
                             pathstr = '/'.join(filter(None, path))
                         data = '\n'.join(makeline()
                                          for __ in xrange(pick(linesinfilesadded))) + '\n'
-                        changes[pathstr] = context.memfilectx(repo, pathstr, data)
+                        changes[pathstr] = data
                     def filectxfn(repo, memctx, path):
-                        return changes[path]
+                        if path not in changes:
+                            return None
+                        return context.memfilectx(repo, path, changes[path])
                     if not changes:
                         continue
                     if revs:
                         date = repo['tip'].date()[0] + pick(interarrival)
                     else:
                         date = time.time() - (86400 * count)
                     # dates in mercurial must be positive, fit in 32-bit signed integers.
                     date = min(0x7fffffff, max(0, date))
                     user = random.choice(words) + '@' + random.choice(words)
                     mc = context.memctx(repo, pl, makeline(minimum=2),
                                         sorted(changes),
                                         filectxfn, user, '%d %d' % (date, pick(tzoffset)))
                     newnode = mc.commit()
                     heads.add(repo.changelog.rev(newnode))
                     heads.discard(r1)
                     heads.discard(r2)
                 lock.release()
                 wlock.release()
             def renamedirs(dirs, words):
                 '''Replace every directory name in a per-directory file count dict.

                 ``dirs`` maps a directory path to a count; the result is a list of
                 [renamed path, count] pairs. New path components are taken from
                 ``words`` in order, starting over when the list is exhausted.
                 '''
                 wordgen = itertools.cycle(words)
                 # Memo of original path -> renamed path; the empty path stays empty.
                 replacements = {'': ''}
                 def rename(dirpath):
                     '''Return the replacement for dirpath, renaming its prefixes first.

                     Results for the path and all of its prefixes are remembered, so a
                     prefix shared by several directories is always renamed the same
                     way, whatever order the directories are visited in.
                     '''
                     try:
                         return replacements[dirpath]
                     except KeyError:
                         pass
                     parent = os.path.split(dirpath)[0]
                     newparent = rename(parent) if parent else ''
                     newpath = os.path.join(newparent, next(wordgen))
                     replacements[dirpath] = newpath
                     return newpath
                 return [[rename(dirpath.lstrip(os.sep)), count]
                         for dirpath, count in dirs.iteritems()]