@@ -1,379 +1,379 @@
# synthrepo.py - repo synthesis
#
# Copyright 2012 Facebook
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''synthesize structurally interesting change history

This extension is useful for creating a repository with properties
that are statistically similar to an existing repository. During
analysis, a simple probability table is constructed from the history
of an existing repository. During synthesis, these properties are
reconstructed.

Properties that are analyzed and synthesized include the following:

- Lines added or removed when an existing file is modified
- Number and sizes of files added
- Number of files removed
- Line lengths
- Topological distance to parent changeset(s)
- Probability of a commit being a merge
- Probability of a newly added file being added to a new directory
- Interarrival time, and time zone, of commits

A few obvious properties that are not currently handled realistically:

- Merges are treated as regular commits with two parents, which is not
  realistic
- Modifications are not treated as operations on hunks of lines, but
  as insertions and deletions of randomly chosen single lines
- Committer ID (always random)
- Executability of files
- Symlinks and binary files are ignored
'''
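
# Example use, as a sketch (the repository names and the extension path
# here are hypothetical):
#
#   [extensions]
#   synthrepo = /path/to/synthrepo.py
#
#   $ hg -R srcrepo analyze -o model.json
#   $ hg init newrepo
#   $ hg -R newrepo synthesize -c 1000 model.json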

import bisect, collections, json, os, random, time, sys
from mercurial import cmdutil, context, patch, scmutil, util, hg
from mercurial.i18n import _
from mercurial.node import nullrev, nullid

testedwith = 'internal'

cmdtable = {}
command = cmdutil.command(cmdtable)

newfile = set(('new fi', 'rename', 'copy f', 'copy t'))

def zerodict():
    return collections.defaultdict(lambda: 0)

def roundto(x, k):
    if x > k * 2:
        return int(round(x / float(k)) * k)
    return int(round(x))
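
# E.g. roundto(137, 5) == 135, but roundto(7, 5) == 7: values up to
# 2 * k stay exact, while larger values are bucketed to the nearest
# multiple of k.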

def parsegitdiff(lines):
    filename, mar, lineadd, lineremove = None, None, zerodict(), 0
    binary = False
    for line in lines:
        start = line[:6]
        if start == 'diff -':
            if filename:
                yield filename, mar, lineadd, lineremove, binary
            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
            filename = patch.gitre.match(line).group(1)
        elif start in newfile:
            mar = 'a'
        elif start == 'GIT bi':
            binary = True
        elif start == 'delete':
            mar = 'r'
        elif start:
            s = start[0]
            if s == '-' and not line.startswith('--- '):
                lineremove += 1
            elif s == '+' and not line.startswith('+++ '):
                lineadd[roundto(len(line) - 1, 5)] += 1
    if filename:
        yield filename, mar, lineadd, lineremove, binary
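
# Each yielded tuple is (filename, mar, lineadd, lineremove, binary),
# where mar is 'm' (modified), 'a' (added) or 'r' (removed), lineadd
# maps bucketed added-line lengths to counts, and lineremove is a plain
# count. For instance, a diff adding a hypothetical file 'foo' with two
# 12-character lines yields ('foo', 'a', {10: 2}, 0, False).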

@command('analyze',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
         _('hg analyze'))
def analyze(ui, repo, *revs, **opts):
    '''create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    '''

    revs = list(revs)
    revs.extend(opts['rev'])
    if not revs:
        revs = [':']

    output = opts['output']
    if not output:
        output = os.path.basename(repo.root) + '.json'

    if output == '-':
        fp = sys.stdout
    else:
        fp = open(output, 'w')

    revs = scmutil.revrange(repo, revs)
    revs.sort()

    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    progress = ui.progress
    _analyzing = _('analyzing')
    _changesets = _('changesets')
    _total = len(revs)

    for i, rev in enumerate(revs):
        progress(_analyzing, i, unit=_changesets, total=_total)
        ctx = repo[rev]
        pl = ctx.parents()
        pctx = pl[0]
        prev = pctx.rev()
        children[prev] += 1
        p1distance[rev - prev] += 1
        parents[len(pl)] += 1
        tzoffset[ctx.date()[1]] += 1
        if len(pl) > 1:
            p2distance[rev - pl[1].rev()] += 1
        if prev == rev - 1:
            lastctx = pctx
        else:
            lastctx = repo[rev - 1]
        if lastctx.rev() != nullrev:
            interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
        diff = sum((d.splitlines()
                    for d in ctx.diff(pctx, opts={'git': True})), [])
        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
        for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
            if binary:
                continue
            added = sum(lineadd.itervalues(), 0)
            if mar == 'm':
                if added and lineremove:
                    lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
                    filechanges += 1
            elif mar == 'a':
                fileadds += 1
                if '/' in filename:
                    filedir = filename.rsplit('/', 1)[0]
                    if filedir not in pctx.dirs():
                        diradds += 1
                linesinfilesadded[roundto(added, 5)] += 1
            elif mar == 'r':
                fileremoves += 1
            for length, count in lineadd.iteritems():
                linelengths[length] += count
        fileschanged[filechanges] += 1
        filesadded[fileadds] += 1
        dirsadded[diradds] += 1
        filesremoved[fileremoves] += 1

    invchildren = zerodict()

    for rev, count in children.iteritems():
        invchildren[count] += 1

    if output != '-':
        ui.status(_('writing output to %s\n') % output)

    def pronk(d):
        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)

    json.dump({'revs': len(revs),
               'lineschanged': pronk(lineschanged),
               'children': pronk(invchildren),
               'fileschanged': pronk(fileschanged),
               'filesadded': pronk(filesadded),
               'linesinfilesadded': pronk(linesinfilesadded),
               'dirsadded': pronk(dirsadded),
               'filesremoved': pronk(filesremoved),
               'linelengths': pronk(linelengths),
               'parents': pronk(parents),
               'p1distance': pronk(p1distance),
               'p2distance': pronk(p2distance),
               'interarrival': pronk(interarrival),
               'tzoffset': pronk(tzoffset),
               },
              fp)
    fp.close()
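
# The model file is plain JSON: 'revs' is a count, and each remaining
# key maps to a list of [value, count] pairs sorted by descending
# count. A made-up fragment for illustration:
#   {"revs": 379, "p1distance": [[1, 350], [2, 20]], ...}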

@command('synthesize',
         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
         _('hg synthesize [OPTION].. DESCFILE'))
def synthesize(ui, repo, descpath, **opts):
    '''synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model.

    When synthesizing new content, commit descriptions, and user
    names, words are chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary.
    '''
    try:
        fp = hg.openpath(ui, descpath)
    except Exception, err:
        raise util.Abort('%s: %s' % (descpath, err.strerror))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs
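
    # cdf converts the model's (value, count) pairs into a cumulative
    # distribution, e.g. cdf([(1, 3), (2, 1)]) == ((1, 2), [0.75, 1.0]),
    # so a draw (see pick below) returns 1 three times out of four.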

    lineschanged = cdf(desc['lineschanged'])
    fileschanged = cdf(desc['fileschanged'])
    filesadded = cdf(desc['filesadded'])
    dirsadded = cdf(desc['dirsadded'])
    filesremoved = cdf(desc['filesremoved'])
    linelengths = cdf(desc['linelengths'])
    parents = cdf(desc['parents'])
    p1distance = cdf(desc['p1distance'])
    p2distance = cdf(desc['p2distance'])
    interarrival = cdf(desc['interarrival'])
    linesinfilesadded = cdf(desc['linesinfilesadded'])
    tzoffset = cdf(desc['tzoffset'])

    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError, err:
        raise util.Abort('%s: %s' % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    def pick(cdf):
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]
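
    # pick does inverse-CDF sampling: bisect locates a uniform random
    # number in the cumulative distribution, and the value at that
    # index is returned.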

    def makeline(minimum=0):
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return ' '.join(l)
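
    # makeline strings random dictionary words together until the line
    # reaches a length sampled from the model (at least 'minimum'
    # characters).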

    wlock = repo.wlock()
    lock = repo.lock()

    nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))

    progress = ui.progress
    _synthesizing = _('synthesizing')
    _changesets = _('changesets')

    count = int(opts['count'])
    heads = set(map(repo.changelog.rev, repo.heads()))
    for i in xrange(count):
        progress(_synthesizing, i, unit=_changesets, total=count)

        node = repo.changelog.node
        revs = len(repo)

        def pickhead(heads, distance):
            if heads:
                lheads = sorted(heads)
                rev = revs - min(pick(distance), revs)
                if rev < lheads[-1]:
                    rev = lheads[bisect.bisect_left(lheads, rev)]
                else:
                    rev = lheads[-1]
                return rev, node(rev)
            return nullrev, nullid
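
        # pickhead honors the sampled parent distance where it can:
        # it walks back 'distance' revisions from the tip, then snaps
        # to the nearest existing head at or after that revision.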

        r1 = revs - min(pick(p1distance), revs)
        p1 = node(r1)

        # the number of heads will grow without bound if we use a pure
        # model, so artificially constrain their proliferation
        if pick(parents) == 2 or len(heads) > random.randint(1, 20):
            r2, p2 = pickhead(heads.difference([r1]), p2distance)
        else:
            r2, p2 = nullrev, nullid

        pl = [p1, p2]
        pctx = repo[r1]
        mf = pctx.manifest()
        mfk = mf.keys()
        changes = {}
        if mfk:
            for __ in xrange(pick(fileschanged)):
                for __ in xrange(10):
                    fctx = pctx.filectx(random.choice(mfk))
                    path = fctx.path()
                    if not (path in nevertouch or fctx.isbinary() or
                            'l' in fctx.flags()):
                        break
                lines = fctx.data().splitlines()
                add, remove = pick(lineschanged)
                for __ in xrange(remove):
                    if not lines:
                        break
                    del lines[random.randrange(0, len(lines))]
                for __ in xrange(add):
                    lines.insert(random.randint(0, len(lines)), makeline())
                path = fctx.path()
                changes[path] = context.memfilectx(path,
                                                   '\n'.join(lines) + '\n')
            for __ in xrange(pick(filesremoved)):
                path = random.choice(mfk)
                for __ in xrange(10):
                    path = random.choice(mfk)
                    if path not in changes:
                        changes[path] = None
                        break
        if filesadded:
            dirs = list(pctx.dirs())
            dirs.append('')
            for __ in xrange(pick(filesadded)):
                path = [random.choice(dirs)]
                if pick(dirsadded):
                    path.append(random.choice(words))
                path.append(random.choice(words))
                path = '/'.join(filter(None, path))
                data = '\n'.join(makeline()
                                 for __ in xrange(pick(linesinfilesadded))) + '\n'
                changes[path] = context.memfilectx(path, data)
        def filectxfn(repo, memctx, path):
            data = changes[path]
            if data is None:
                raise IOError
            return data
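
        # memctx calls filectxfn for each path in 'changes'; returning
        # a memfilectx adds or modifies the file, while raising IOError
        # marks it as removed.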
        if not changes:
            continue
        if revs:
            date = repo['tip'].date()[0] + pick(interarrival)
        else:
            date = time.time() - (86400 * count)
        user = random.choice(words) + '@' + random.choice(words)
        mc = context.memctx(repo, pl, makeline(minimum=2),
                            sorted(changes.iterkeys()),
                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
        newnode = mc.commit()
        heads.add(repo.changelog.rev(newnode))
        heads.discard(r1)
        heads.discard(r2)

    lock.release()
    wlock.release()