@@ -1,376 +1,442 @@
 # synthrepo.py - repo synthesis
 #
 # Copyright 2012 Facebook
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 '''synthesize structurally interesting change history
 
 This extension is useful for creating a repository with properties
 that are statistically similar to an existing repository. During
 analysis, a simple probability table is constructed from the history
 of an existing repository. During synthesis, these properties are
 reconstructed.
 
 Properties that are analyzed and synthesized include the following:
 
 - Lines added or removed when an existing file is modified
 - Number and sizes of files added
 - Number of files removed
 - Line lengths
 - Topological distance to parent changeset(s)
 - Probability of a commit being a merge
 - Probability of a newly added file being added to a new directory
 - Interarrival time, and time zone, of commits
 
 A few obvious properties that are not currently handled realistically:
 
 - Merges are treated as regular commits with two parents, which is not
   realistic
 - Modifications are not treated as operations on hunks of lines, but
   as insertions and deletions of randomly chosen single lines
 - Committer ID (always random)
 - Executability of files
 - Symlinks and binary files are ignored
 '''
 
-import bisect, collections, json, os, random, time, sys
+import bisect, collections, itertools, json, os, random, time, sys
 from mercurial import cmdutil, context, patch, scmutil, util, hg
 from mercurial.i18n import _
-from mercurial.node import nullrev, nullid
+from mercurial.node import nullrev, nullid, short
 
 testedwith = 'internal'
 
 cmdtable = {}
 command = cmdutil.command(cmdtable)
 
 newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
 
 def zerodict():
     return collections.defaultdict(lambda: 0)
 
 def roundto(x, k):
     if x > k * 2:
         return int(round(x / float(k)) * k)
     return int(round(x))
 
 def parsegitdiff(lines):
     filename, mar, lineadd, lineremove = None, None, zerodict(), 0
     binary = False
     for line in lines:
         start = line[:6]
         if start == 'diff -':
             if filename:
                 yield filename, mar, lineadd, lineremove, binary
             mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
             filename = patch.gitre.match(line).group(1)
         elif start in newfile:
             mar = 'a'
         elif start == 'GIT bi':
             binary = True
         elif start == 'delete':
             mar = 'r'
         elif start:
             s = start[0]
             if s == '-' and not line.startswith('--- '):
                 lineremove += 1
             elif s == '+' and not line.startswith('+++ '):
                 lineadd[roundto(len(line) - 1, 5)] += 1
     if filename:
         yield filename, mar, lineadd, lineremove, binary
 
 @command('analyze',
          [('o', 'output', [], _('write output to given file'), _('FILE')),
           ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
          _('hg analyze'))
 def analyze(ui, repo, *revs, **opts):
     '''create a simple model of a repository to use for later synthesis
 
     This command examines every changeset in the given range (or all
     of history if none are specified) and creates a simple statistical
     model of the history of the repository.
 
     The model is written out to a JSON file, and can be used by
     :hg:`synthesize` to create or augment a repository with synthetic
     commits that have a structure that is statistically similar to the
     analyzed repository.
     '''
 
     revs = list(revs)
     revs.extend(opts['rev'])
     if not revs:
         revs = [':']
 
     output = opts['output']
     if not output:
         output = os.path.basename(repo.root) + '.json'
 
     if output == '-':
         fp = sys.stdout
     else:
         fp = open(output, 'w')
 
     revs = scmutil.revrange(repo, revs)
     revs.sort()
 
     lineschanged = zerodict()
     children = zerodict()
     p1distance = zerodict()
     p2distance = zerodict()
     linesinfilesadded = zerodict()
     fileschanged = zerodict()
     filesadded = zerodict()
     filesremoved = zerodict()
     linelengths = zerodict()
     interarrival = zerodict()
     parents = zerodict()
     dirsadded = zerodict()
     tzoffset = zerodict()
 
     progress = ui.progress
     _analyzing = _('analyzing')
     _changesets = _('changesets')
     _total = len(revs)
 
     for i, rev in enumerate(revs):
         progress(_analyzing, i, unit=_changesets, total=_total)
         ctx = repo[rev]
         pl = ctx.parents()
         pctx = pl[0]
         prev = pctx.rev()
         children[prev] += 1
         p1distance[rev - prev] += 1
         parents[len(pl)] += 1
         tzoffset[ctx.date()[1]] += 1
         if len(pl) > 1:
             p2distance[rev - pl[1].rev()] += 1
         if prev == rev - 1:
             lastctx = pctx
         else:
             lastctx = repo[rev - 1]
         if lastctx.rev() != nullrev:
             interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
         diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
         fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
         for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
             if binary:
                 continue
             added = sum(lineadd.itervalues(), 0)
             if mar == 'm':
                 if added and lineremove:
                     lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
                     filechanges += 1
             elif mar == 'a':
                 fileadds += 1
                 if '/' in filename:
                     filedir = filename.rsplit('/', 1)[0]
                     if filedir not in pctx.dirs():
                         diradds += 1
                 linesinfilesadded[roundto(added, 5)] += 1
             elif mar == 'r':
                 fileremoves += 1
             for length, count in lineadd.iteritems():
                 linelengths[length] += count
         fileschanged[filechanges] += 1
         filesadded[fileadds] += 1
         dirsadded[diradds] += 1
         filesremoved[fileremoves] += 1
 
     invchildren = zerodict()
 
     for rev, count in children.iteritems():
         invchildren[count] += 1
 
     if output != '-':
         ui.status(_('writing output to %s\n') % output)
 
     def pronk(d):
         return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
 
     json.dump({'revs': len(revs),
                'lineschanged': pronk(lineschanged),
                'children': pronk(invchildren),
                'fileschanged': pronk(fileschanged),
                'filesadded': pronk(filesadded),
                'linesinfilesadded': pronk(linesinfilesadded),
                'dirsadded': pronk(dirsadded),
                'filesremoved': pronk(filesremoved),
                'linelengths': pronk(linelengths),
                'parents': pronk(parents),
                'p1distance': pronk(p1distance),
                'p2distance': pronk(p2distance),
                'interarrival': pronk(interarrival),
                'tzoffset': pronk(tzoffset),
                },
               fp)
     fp.close()
 
 @command('synthesize',
          [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
-          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
+          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
+          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
          _('hg synthesize [OPTION].. DESCFILE'))
 def synthesize(ui, repo, descpath, **opts):
     '''synthesize commits based on a model of an existing repository
 
     The model must have been generated by :hg:`analyze`. Commits will
     be generated randomly according to the probabilities described in
-    the model.
+    the model. If --initfiles is set, the repository will be seeded with
+    the given number of files following the modeled repository's directory
+    structure.
 
     When synthesizing new content, commit descriptions, and user
     names, words will be chosen randomly from a dictionary that is
     presumed to contain one word per line. Use --dict to specify the
     path to an alternate dictionary to use.
     '''
     try:
         fp = hg.openpath(ui, descpath)
     except Exception, err:
         raise util.Abort('%s: %s' % (descpath, err[0].strerror))
     desc = json.load(fp)
     fp.close()
 
     def cdf(l):
         if not l:
             return [], []
         vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
         t = float(sum(probs, 0))
         s, cdfs = 0, []
         for v in probs:
             s += v
             cdfs.append(s / t)
         return vals, cdfs
 
|
246 | lineschanged = cdf(desc['lineschanged']) | |
244 | fileschanged = cdf(desc['fileschanged']) |
|
247 | fileschanged = cdf(desc['fileschanged']) | |
245 | filesadded = cdf(desc['filesadded']) |
|
248 | filesadded = cdf(desc['filesadded']) | |
246 | dirsadded = cdf(desc['dirsadded']) |
|
249 | dirsadded = cdf(desc['dirsadded']) | |
247 | filesremoved = cdf(desc['filesremoved']) |
|
250 | filesremoved = cdf(desc['filesremoved']) | |
248 | linelengths = cdf(desc['linelengths']) |
|
251 | linelengths = cdf(desc['linelengths']) | |
249 | parents = cdf(desc['parents']) |
|
252 | parents = cdf(desc['parents']) | |
250 | p1distance = cdf(desc['p1distance']) |
|
253 | p1distance = cdf(desc['p1distance']) | |
251 | p2distance = cdf(desc['p2distance']) |
|
254 | p2distance = cdf(desc['p2distance']) | |
252 | interarrival = cdf(desc['interarrival']) |
|
255 | interarrival = cdf(desc['interarrival']) | |
253 | linesinfilesadded = cdf(desc['linesinfilesadded']) |
|
256 | linesinfilesadded = cdf(desc['linesinfilesadded']) | |
254 | tzoffset = cdf(desc['tzoffset']) |
|
257 | tzoffset = cdf(desc['tzoffset']) | |
255 |
|
258 | |||
256 | dictfile = opts.get('dict') or '/usr/share/dict/words' |
|
259 | dictfile = opts.get('dict') or '/usr/share/dict/words' | |
257 | try: |
|
260 | try: | |
258 | fp = open(dictfile, 'rU') |
|
261 | fp = open(dictfile, 'rU') | |
259 | except IOError, err: |
|
262 | except IOError, err: | |
260 | raise util.Abort('%s: %s' % (dictfile, err.strerror)) |
|
263 | raise util.Abort('%s: %s' % (dictfile, err.strerror)) | |
261 | words = fp.read().splitlines() |
|
264 | words = fp.read().splitlines() | |
262 | fp.close() |
|
265 | fp.close() | |
263 |
|
266 | |||
|
+    initdirs = {}
+    if desc['initdirs']:
+        for k, v in desc['initdirs']:
+            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
+        initdirs = renamedirs(initdirs, words)
+    initdirscdf = cdf(initdirs)
+
     def pick(cdf):
         return cdf[0][bisect.bisect_left(cdf[1], random.random())]
 
+    def pickpath():
+        return os.path.join(pick(initdirscdf), random.choice(words))
+
     def makeline(minimum=0):
         total = max(minimum, pick(linelengths))
         c, l = 0, []
         while c < total:
             w = random.choice(words)
             c += len(w) + 1
             l.append(w)
         return ' '.join(l)
 
     wlock = repo.wlock()
     lock = repo.lock()
 
     nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))
 
     progress = ui.progress
     _synthesizing = _('synthesizing')
+    _files = _('initial files')
     _changesets = _('changesets')
 
+    # Synthesize a single initial revision adding files to the repo according
+    # to the modeled directory structure.
+    initcount = int(opts['initfiles'])
+    if initcount and initdirs:
+        pctx = repo[None].parents()[0]
+        files = {}
+        for i in xrange(0, initcount):
+            ui.progress(_synthesizing, i, unit=_files, total=initcount)
+
+            path = pickpath()
+            while path in pctx.dirs():
+                path = pickpath()
+            data = '%s contents\n' % path
+            files[path] = context.memfilectx(repo, path, data)
+
+        def filectxfn(repo, memctx, path):
+            return files[path]
+
+        ui.progress(_synthesizing, None)
+        message = 'synthesized wide repo with %d files' % (len(files),)
+        mc = context.memctx(repo, [pctx.node(), nullid], message,
+                            files.iterkeys(), filectxfn, ui.username(),
+                            '%d %d' % util.makedate())
+        initnode = mc.commit()
+        hexfn = ui.debugflag and hex or short
+        ui.status(_('added commit %s with %d files\n')
+                  % (hexfn(initnode), len(files)))
+
+    # Synthesize incremental revisions to the repository, adding repo depth.
     count = int(opts['count'])
     heads = set(map(repo.changelog.rev, repo.heads()))
     for i in xrange(count):
         progress(_synthesizing, i, unit=_changesets, total=count)
 
         node = repo.changelog.node
         revs = len(repo)
 
         def pickhead(heads, distance):
             if heads:
                 lheads = sorted(heads)
                 rev = revs - min(pick(distance), revs)
                 if rev < lheads[-1]:
                     rev = lheads[bisect.bisect_left(lheads, rev)]
                 else:
                     rev = lheads[-1]
                 return rev, node(rev)
             return nullrev, nullid
 
         r1 = revs - min(pick(p1distance), revs)
         p1 = node(r1)
 
         # the number of heads will grow without bound if we use a pure
         # model, so artificially constrain their proliferation
         toomanyheads = len(heads) > random.randint(1, 20)
         if p2distance[0] and (pick(parents) == 2 or toomanyheads):
             r2, p2 = pickhead(heads.difference([r1]), p2distance)
         else:
             r2, p2 = nullrev, nullid
 
         pl = [p1, p2]
         pctx = repo[r1]
         mf = pctx.manifest()
         mfk = mf.keys()
         changes = {}
         if mfk:
             for __ in xrange(pick(fileschanged)):
                 for __ in xrange(10):
                     fctx = pctx.filectx(random.choice(mfk))
                     path = fctx.path()
                     if not (path in nevertouch or fctx.isbinary() or
                             'l' in fctx.flags()):
                         break
                 lines = fctx.data().splitlines()
                 add, remove = pick(lineschanged)
                 for __ in xrange(remove):
                     if not lines:
                         break
                     del lines[random.randrange(0, len(lines))]
                 for __ in xrange(add):
                     lines.insert(random.randint(0, len(lines)), makeline())
                 path = fctx.path()
                 changes[path] = context.memfilectx(repo, path,
                                                    '\n'.join(lines) + '\n')
             for __ in xrange(pick(filesremoved)):
                 path = random.choice(mfk)
                 for __ in xrange(10):
                     path = random.choice(mfk)
                     if path not in changes:
                         changes[path] = None
                         break
         if filesadded:
             dirs = list(pctx.dirs())
             dirs.append('')
         for __ in xrange(pick(filesadded)):
             path = [random.choice(dirs)]
             if pick(dirsadded):
                 path.append(random.choice(words))
             path.append(random.choice(words))
             path = '/'.join(filter(None, path))
             data = '\n'.join(makeline()
                              for __ in xrange(pick(linesinfilesadded))) + '\n'
             changes[path] = context.memfilectx(repo, path, data)
         def filectxfn(repo, memctx, path):
             return changes[path]
         if not changes:
             continue
         if revs:
             date = repo['tip'].date()[0] + pick(interarrival)
         else:
             date = time.time() - (86400 * count)
         user = random.choice(words) + '@' + random.choice(words)
         mc = context.memctx(repo, pl, makeline(minimum=2),
                             sorted(changes.iterkeys()),
                             filectxfn, user, '%d %d' % (date, pick(tzoffset)))
         newnode = mc.commit()
         heads.add(repo.changelog.rev(newnode))
         heads.discard(r1)
         heads.discard(r2)
 
     lock.release()
     wlock.release()
+
+def renamedirs(dirs, words):
+    '''Randomly rename the directory names in the per-dir file count dict.'''
+    wordgen = itertools.cycle(words)
+    replacements = {'': ''}
+    def rename(dirpath):
+        '''Recursively rename the directory and all path prefixes.
+
+        The mapping from path to renamed path is stored for all path prefixes
+        as in dynamic programming, ensuring linear runtime and consistent
+        renaming regardless of iteration order through the model.
+        '''
+        if dirpath in replacements:
+            return replacements[dirpath]
+        head, _ = os.path.split(dirpath)
+        head = head and rename(head) or ''
+        renamed = os.path.join(head, wordgen.next())
+        replacements[dirpath] = renamed
+        return renamed
+    result = []
+    for dirpath, count in dirs.iteritems():
+        result.append([rename(dirpath.lstrip(os.sep)), count])
+    return result
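
Both analyze and synthesize lean on roundto() to keep the frequency tables small: a value stays exact while it is at most twice the bucket size, and anything larger snaps to the nearest multiple of k. A few worked calls of the function exactly as it is defined above:

def roundto(x, k):
    if x > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))

roundto(3, 5)       # -> 3   small values are kept exact
roundto(137, 5)     # -> 135 added-line counts are bucketed to multiples of 5
roundto(601, 300)   # -> 600 commit interarrival times are bucketed to 5 minutes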
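
parsegitdiff() classifies each line of a git-style diff by its first six characters instead of parsing it fully. A simplified, standalone sketch of that prefix test follows; the sample diff text is invented for illustration and skips the file-status prefixes ('new fi', 'delete', and so on) that the real function also recognizes:

sample = """diff --git a/a.txt b/a.txt
--- a/a.txt
+++ b/a.txt
@@ -1,2 +1,2 @@
-old line
+a new, slightly longer line
""".splitlines()

added, removed = 0, 0
for line in sample:
    start = line[:6]
    if start and start[0] == '-' and not line.startswith('--- '):
        removed += 1        # a removed content line
    elif start and start[0] == '+' and not line.startswith('+++ '):
        added += 1          # an added content line; the real code also
                            # buckets its length with roundto(len(line) - 1, 5)
print added, removed        # -> 1 1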
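
Every structural decision synthesize makes (parent count, files touched, line lengths, timing) is a weighted draw from the model through the cdf()/pick() pair shown above. The following standalone sketch reproduces that sampling step; the model entries here are invented, whereas a real model is the JSON that hg analyze writes (lists of [value, count] pairs produced by pronk()):

import bisect, random

# Invented [value, count] pairs in the same shape pronk() emits.
parentsmodel = [[1, 90], [2, 10]]    # 90 linear commits, 10 merges observed

def cdf(l):
    # Sort by descending count and build a cumulative distribution summing to 1.0.
    if not l:
        return [], []
    vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
    t = float(sum(probs, 0))
    s, cdfs = 0, []
    for v in probs:
        s += v
        cdfs.append(s / t)
    return vals, cdfs

def pick(cdf):
    # Map a uniform sample to the first bucket whose cumulative weight covers it.
    return cdf[0][bisect.bisect_left(cdf[1], random.random())]

parents = cdf(parentsmodel)
samples = [pick(parents) for __ in xrange(1000)]
# samples is roughly 90% ones and 10% twos.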
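
renamedirs() memoizes every path prefix it has already renamed, so directories that share a prefix in the analyzed repository still share one after renaming. A hypothetical call, with the module's renamedirs() in scope and made-up input paths and word list:

renamed = renamedirs({'src/core': 12, 'src/tests': 3}, ['alpha', 'beta', 'gamma'])
# Both results begin with the same replacement for 'src', for example
# [['alpha/beta', 12], ['alpha/gamma', 3]] (dict iteration order may swap the tails).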