synthrepo: close progress topics...
Martin von Zweigbergk
r38428:ce65c25d default
@@ -1,514 +1,516 @@
# synthrepo.py - repo synthesis
#
# Copyright 2012 Facebook
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''synthesize structurally interesting change history

This extension is useful for creating a repository with properties
that are statistically similar to an existing repository. During
analysis, a simple probability table is constructed from the history
of an existing repository. During synthesis, these properties are
reconstructed.

Properties that are analyzed and synthesized include the following:

- Lines added or removed when an existing file is modified
- Number and sizes of files added
- Number of files removed
- Line lengths
- Topological distance to parent changeset(s)
- Probability of a commit being a merge
- Probability of a newly added file being added to a new directory
- Interarrival time, and time zone, of commits
- Number of files in each directory

A few obvious properties that are not currently handled realistically:

- Merges are treated as regular commits with two parents, which is not
  realistic
- Modifications are not treated as operations on hunks of lines, but
  as insertions and deletions of randomly chosen single lines
- Committer ID (always random)
- Executability of files
- Symlinks and binary files are ignored
'''
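
# An illustrative session (paths and counts are assumptions, not taken from
# this page; the extension must be enabled in the usual way, e.g. via
# --config extensions.synthrepo=/path/to/synthrepo.py):
#
#   $ hg analyze -o model.json            # run inside the repo to be modeled
#   $ hg init synthetic && cd synthetic
#   $ hg synthesize --count 100 --initfiles 500 model.json
#
# 'analyze' writes the JSON model (by default <repo-basename>.json);
# 'synthesize' then replays commits whose statistics follow that model.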
38
38
39 from __future__ import absolute_import
39 from __future__ import absolute_import
40 import bisect
40 import bisect
41 import collections
41 import collections
42 import itertools
42 import itertools
43 import json
43 import json
44 import os
44 import os
45 import random
45 import random
46 import sys
46 import sys
47 import time
47 import time
48
48
49 from mercurial.i18n import _
49 from mercurial.i18n import _
50 from mercurial.node import (
50 from mercurial.node import (
51 nullid,
51 nullid,
52 nullrev,
52 nullrev,
53 short,
53 short,
54 )
54 )
55 from mercurial import (
55 from mercurial import (
56 context,
56 context,
57 error,
57 error,
58 hg,
58 hg,
59 patch,
59 patch,
60 registrar,
60 registrar,
61 scmutil,
61 scmutil,
62 )
62 )
63 from mercurial.utils import dateutil
63 from mercurial.utils import dateutil
64
64
65 # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
65 # Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
66 # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
66 # extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
67 # be specifying the version(s) of Mercurial they are tested with, or
67 # be specifying the version(s) of Mercurial they are tested with, or
68 # leave the attribute unspecified.
68 # leave the attribute unspecified.
69 testedwith = 'ships-with-hg-core'
69 testedwith = 'ships-with-hg-core'
70
70
71 cmdtable = {}
71 cmdtable = {}
72 command = registrar.command(cmdtable)
72 command = registrar.command(cmdtable)
73
73
74 newfile = {'new fi', 'rename', 'copy f', 'copy t'}
74 newfile = {'new fi', 'rename', 'copy f', 'copy t'}
75
75
def zerodict():
    return collections.defaultdict(lambda: 0)

def roundto(x, k):
    if x > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))
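
# Illustrative behavior of the two helpers above (values worked out by hand):
#
#   >>> d = zerodict(); d['missing'] += 1   # absent keys default to 0
#   >>> roundto(23, 5)    # 23 > 2*5, so round to the nearest multiple of 5
#   25
#   >>> roundto(8, 5)     # 8 <= 2*5, so small values keep full resolution
#   8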

def parsegitdiff(lines):
    filename, mar, lineadd, lineremove = None, None, zerodict(), 0
    binary = False
    for line in lines:
        start = line[:6]
        if start == 'diff -':
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
            filename = patch.gitre.match(line).group(1)
        elif start in newfile:
            mar = 'a'
        elif start == 'GIT bi':
            binary = True
        elif start == 'delete':
            mar = 'r'
        elif start:
            s = start[0]
            if s == '-' and not line.startswith('--- '):
                lineremove += 1
            elif s == '+' and not line.startswith('+++ '):
                lineadd[roundto(len(line) - 1, 5)] += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary
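
# parsegitdiff classifies each file by the first six characters of the git
# extended header lines: 'new fi'le mode / 'rename' / 'copy f'rom / 'copy t'o
# mark an add ('a'), 'delete'd file mode marks a removal ('r'), 'GIT bi'nary
# patch flags the file as binary, and anything else stays a modification
# ('m').  A hypothetical input such as
#
#   diff --git a/foo.py b/foo.py
#   --- a/foo.py
#   +++ b/foo.py
#   +print('hi')
#
# would yield ('foo.py', 'm', {10: 1}, 0, False): one added line whose
# content length of 11 characters (the line minus its '+') rounds to 10, and
# no removed lines.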

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'), optionalrepo=True)
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''
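    # The dumped model is one JSON object; each measured property maps to a
    # frequency table of [value, count] pairs, most frequent first (see
    # pronk() below).  An illustrative fragment, not real output:
    #
    #   {"revs": 120,
    #    "p1distance": [[1, 100], [2, 12], [5, 8]],
    #    "linelengths": [[30, 2400], [35, 1100]], ...}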
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_('error walking directory structure: %s\n') % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.makeprogress(_('analyzing'), unit=_('changesets'),
                                   total=len(revs))
        for i, rev in enumerate(revs):
            progress.update(i)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                added = sum(lineadd.itervalues(), 0)
                if mar == 'm':
                    if added and lineremove:
                        lineschanged[roundto(added, 5),
                                     roundto(lineremove, 5)] += 1
                        filechanges += 1
                elif mar == 'a':
                    fileadds += 1
                    if '/' in filename:
                        filedir = filename.rsplit('/', 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == 'r':
                    fileremoves += 1
                for length, count in lineadd.iteritems():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1
        progress.complete()

    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'initdirs': pronk(dirs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    fp.close()
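
# A small worked example of the children -> invchildren inversion inside
# analyze(): if children == {0: 2, 1: 1} ("rev 0 has two children, rev 1 has
# one"), then invchildren == {2: 1, 1: 1}: one revision with two children and
# one with a single child, which is the distribution the model records.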

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number of files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        raise error.Abort('%s: %s' % (descpath, err.strerror))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs
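
    # Worked example: cdf([['a', 3], ['b', 1]]) returns (('a', 'b'),
    # [0.75, 1.0]).  pick() (defined below) draws a uniform random number and
    # maps it through this cumulative table with bisect, so 'a' is chosen
    # about 75% of the time and 'b' about 25%.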

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise error.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc['initdirs']:
        for k, v in desc['initdirs']:
            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)
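
    # makeline() draws a target length from the modeled line-length
    # distribution, then concatenates random dictionary words until that
    # length is reached; a picked total of 20 might produce something like
    # 'aardvark quixotic ox' (illustrative output only).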

    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = {'.hgsub', '.hgignore', '.hgtags'}

    _synthesizing = _('synthesizing')
    _files = _('initial files')
    _changesets = _('changesets')
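
    # The actual change in this revision is to close the progress topics:
    # the missing progress.complete() calls are added after the analyze()
    # loop and after the main synthesize loop below, so every
    # ui.makeprogress() now follows the usual pattern:
    #
    #     progress = ui.makeprogress(topic, unit=unit, total=total)
    #     for i, item in enumerate(items):
    #         progress.update(i)
    #         ...  # work
    #     progress.complete()  # clears the progress topic when done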

    # Synthesize a single initial revision adding files to the repo according
    # to the modeled directory structure.
    initcount = int(opts['initfiles'])
    if initcount and initdirs:
        pctx = repo[None].parents()[0]
        dirs = set(pctx.dirs())
        files = {}

        def validpath(path):
            # Don't pick filenames which are already directory names.
            if path in dirs:
                return False
            # Don't pick directories which were used as file names.
            while path:
                if path in files:
                    return False
                path = os.path.dirname(path)
            return True

        progress = ui.makeprogress(_synthesizing, unit=_files, total=initcount)
        for i in xrange(0, initcount):
            progress.update(i)

            path = pickpath()
            while not validpath(path):
                path = pickpath()
            data = '%s contents\n' % path
            files[path] = data
            dir = os.path.dirname(path)
            while dir and dir not in dirs:
                dirs.add(dir)
                dir = os.path.dirname(dir)

        def filectxfn(repo, memctx, path):
            return context.memfilectx(repo, memctx, path, files[path])

        progress.complete()
        message = 'synthesized wide repo with %d files' % (len(files),)
        mc = context.memctx(repo, [pctx.node(), nullid], message,
                            files, filectxfn, ui.username(),
                            '%d %d' % dateutil.makedate())
        initnode = mc.commit()
        if ui.debugflag:
            hexfn = hex
        else:
            hexfn = short
        ui.status(_('added commit %s with %d files\n')
                  % (hexfn(initnode), len(files)))
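
    # context.memctx above assembles the commit entirely in memory: it takes
    # the parent nodes, the changed paths, and filectxfn, which is called
    # back once per path to supply that file's content as a memfilectx;
    # commit() then writes the changeset and returns its node id.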

    # Synthesize incremental revisions to the repository, adding repo depth.
    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    progress = ui.makeprogress(_synthesizing, unit=_changesets, total=count)
    for i in xrange(count):
        progress.update(i)

        node = repo.changelog.node
        revs = len(repo)

        def pickhead(heads, distance):
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid

        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        toomanyheads = len(heads) > random.randint(1, 20)
        if p2distance[0] and (pick(parents) == 2 or toomanyheads):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        changes = {}
        if mfk:
            for __ in xrange(pick(fileschanged)):
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                path = fctx.path()
                changes[path] = '\n'.join(lines) + '\n'
            for __ in xrange(pick(filesremoved)):
                path = random.choice(mfk)
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        break
        if filesadded:
            dirs = list(pctx.dirs())
            dirs.insert(0, '')
        for __ in xrange(pick(filesadded)):
            pathstr = ''
            while pathstr in dirs:
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                pathstr = '/'.join(filter(None, path))
            data = '\n'.join(makeline()
                             for __ in xrange(pick(linesinfilesadded))) + '\n'
            changes[pathstr] = data
        def filectxfn(repo, memctx, path):
            if path not in changes:
                return None
            return context.memfilectx(repo, memctx, path, changes[path])
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            date = time.time() - (86400 * count)
        # dates in mercurial must be positive, fit in 32-bit signed integers.
        date = min(0x7fffffff, max(0, date))
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)
    progress.complete()

    lock.release()
    wlock.release()

def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.'''
    wordgen = itertools.cycle(words)
    replacements = {'': ''}
    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        '''
        if dirpath in replacements:
            return replacements[dirpath]
        head, _ = os.path.split(dirpath)
        if head:
            head = rename(head)
        else:
            head = ''
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed
    result = []
    for dirpath, count in dirs.iteritems():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
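
# Worked example for renamedirs(): with words cycling 'apple', 'pear', the
# input {'src': 3, 'src/util': 2} maps 'src' -> 'apple', and 'src/util'
# reuses the memoized prefix to become 'apple/pear'.  Because every prefix is
# resolved through the replacements cache, shared prefixes always rename
# consistently no matter which directory is visited first.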