upstream/mercurial-mirror Commit - r22709:889789a2

contrib/synthrepo: walk a repo's directory structure during analysis...

Mike Edgar -

r22709:889789a2 default

parent child

contrib/synthrepo.py

0 +75 -53

@@ -23,6 +23,7 b' Properties that are analyzed and synthes'
23	- Probability of a commit being a merge	23	- Probability of a commit being a merge
24	- Probability of a newly added file being added to a new directory	24	- Probability of a newly added file being added to a new directory
25	- Interarrival time, and time zone, of commits	25	- Interarrival time, and time zone, of commits
		26	- Number of files in each directory
26		27
27	A few obvious properties that are not currently handled realistically:	28	A few obvious properties that are not currently handled realistically:
28		29
@@ -81,21 +82,25 b' def parsegitdiff(lines):'
81	yield filename, mar, lineadd, lineremove, binary	82	yield filename, mar, lineadd, lineremove, binary
82		83
83	@command('analyze',	84	@command('analyze',
84	[('o', 'output', [], _('write output to given file'), _('FILE')),	85	[('o', 'output', '', _('write output to given file'), _('FILE')),
85	('r', 'rev', [], _('analyze specified revisions'), _('REV'))],	86	('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
86	_('hg analyze'))	87	_('hg analyze'), optionalrepo=True)
87	def analyze(ui, repo, *revs, **opts):	88	def analyze(ui, repo, *revs, **opts):
88	'''create a simple model of a repository to use for later synthesis	89	'''create a simple model of a repository to use for later synthesis
89		90
90	This command examines every changeset in the given range (or all	91	This command examines every changeset in the given range (or all
91	of history if none are specified) and creates a simple statistical	92	of history if none are specified) and creates a simple statistical
92	model of the history of the repository.	93	model of the history of the repository. It also measures the directory
		94	structure of the repository as checked out.
93		95
94	The model is written out to a JSON file, and can be used by	96	The model is written out to a JSON file, and can be used by
95	:hg:`synthesize` to create or augment a repository with synthetic	97	:hg:`synthesize` to create or augment a repository with synthetic
96	commits that have a structure that is statistically similar to the	98	commits that have a structure that is statistically similar to the
97	analyzed repository.	99	analyzed repository.
98	'''	100	'''
		101	root = repo.root
		102	if not root.endswith(os.path.sep):
		103	root += os.path.sep
99		104
100	revs = list(revs)	105	revs = list(revs)
101	revs.extend(opts['rev'])	106	revs.extend(opts['rev'])
@@ -104,15 +109,24 b' def analyze(ui, repo, *revs, **opts):'
104		109
105	output = opts['output']	110	output = opts['output']
106	if not output:	111	if not output:
107	output = os.path.basename(repo.root) + '.json'	112	output = os.path.basename(root) + '.json'
108		113
109	if output == '-':	114	if output == '-':
110	fp = sys.stdout	115	fp = sys.stdout
111	else:	116	else:
112	fp = open(output, 'w')	117	fp = open(output, 'w')
113		118
114	revs = scmutil.revrange(repo, revs)	119	# Always obtain file counts of each directory in the given root directory.
115	revs.sort()	120	def onerror(e):
		121	ui.warn(_('error walking directory structure: %s\n') % e)
		122
		123	dirs = {}
		124	rootprefixlen = len(root)
		125	for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
		126	dirpathfromroot = dirpath[rootprefixlen:]
		127	dirs[dirpathfromroot] = len(filenames)
		128	if '.hg' in dirnames:
		129	dirnames.remove('.hg')
116		130
117	lineschanged = zerodict()	131	lineschanged = zerodict()
118	children = zerodict()	132	children = zerodict()
@@ -128,54 +142,61 b' def analyze(ui, repo, *revs, **opts):'
128	dirsadded = zerodict()	142	dirsadded = zerodict()
129	tzoffset = zerodict()	143	tzoffset = zerodict()
130		144
131	progress = ui.progress	145	# If a mercurial repo is available, also model the commit history.
132	_analyzing = _('analyzing')	146	if repo:
133	_changesets = _('changesets')	147	revs = scmutil.revrange(repo, revs)
134	_total = len(revs)	148	revs.sort()
		149
		150	progress = ui.progress
		151	_analyzing = _('analyzing')
		152	_changesets = _('changesets')
		153	_total = len(revs)
135		154
136	for i, rev in enumerate(revs):	155	for i, rev in enumerate(revs):
137	progress(_analyzing, i, unit=_changesets, total=_total)	156	progress(_analyzing, i, unit=_changesets, total=_total)
138	ctx = repo[rev]	157	ctx = repo[rev]
139	pl = ctx.parents()	158	pl = ctx.parents()
140	pctx = pl[0]	159	pctx = pl[0]
141	prev = pctx.rev()	160	prev = pctx.rev()
142	children[prev] += 1	161	children[prev] += 1
143	p1distance[rev - prev] += 1	162	p1distance[rev - prev] += 1
144	parents[len(pl)] += 1	163	parents[len(pl)] += 1
145	tzoffset[ctx.date()[1]] += 1	164	tzoffset[ctx.date()[1]] += 1
146	if len(pl) > 1:	165	if len(pl) > 1:
147	p2distance[rev - pl[1].rev()] += 1	166	p2distance[rev - pl[1].rev()] += 1
148	if prev == rev - 1:	167	if prev == rev - 1:
149	lastctx = pctx	168	lastctx = pctx
150	else:	169	else:
151	lastctx = repo[rev - 1]	170	lastctx = repo[rev - 1]
152	if lastctx.rev() != nullrev:	171	if lastctx.rev() != nullrev:
153	interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1	172	timedelta = ctx.date()[0] - lastctx.date()[0]
154	diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])	173	interarrival[roundto(timedelta, 300)] += 1
155	fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0	174	diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
156	for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):	175	fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
157	if binary:	176	for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
158	continue	177	if isbin:
159	added = sum(lineadd.itervalues(), 0)	178	continue
160	if mar == 'm':	179	added = sum(lineadd.itervalues(), 0)
161	if added and lineremove:	180	if mar == 'm':
162	lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1	181	if added and lineremove:
163	filechanges += 1	182	lineschanged[roundto(added, 5),
164	elif mar == 'a':	183	roundto(lineremove, 5)] += 1
165	fileadds += 1	184	filechanges += 1
166	if '/' in filename:	185	elif mar == 'a':
167	filedir = filename.rsplit('/', 1)[0]	186	fileadds += 1
168	if filedir not in pctx.dirs():	187	if '/' in filename:
169	diradds += 1	188	filedir = filename.rsplit('/', 1)[0]
170	linesinfilesadded[roundto(added, 5)] += 1	189	if filedir not in pctx.dirs():
171	elif mar == 'r':	190	diradds += 1
172	fileremoves += 1	191	linesinfilesadded[roundto(added, 5)] += 1
173	for length, count in lineadd.iteritems():	192	elif mar == 'r':
174	linelengths[length] += count	193	fileremoves += 1
175	fileschanged[filechanges] += 1	194	for length, count in lineadd.iteritems():
176	filesadded[fileadds] += 1	195	linelengths[length] += count
177	dirsadded[diradds] += 1	196	fileschanged[filechanges] += 1
178	filesremoved[fileremoves] += 1	197	filesadded[fileadds] += 1
		198	dirsadded[diradds] += 1
		199	filesremoved[fileremoves] += 1
179		200
180	invchildren = zerodict()	201	invchildren = zerodict()
181		202
@@ -189,6 +210,7 b' def analyze(ui, repo, *revs, **opts):'
189	return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)	210	return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
190		211
191	json.dump({'revs': len(revs),	212	json.dump({'revs': len(revs),
		213	'initdirs': pronk(dirs),
192	'lineschanged': pronk(lineschanged),	214	'lineschanged': pronk(lineschanged),
193	'children': pronk(invchildren),	215	'children': pronk(invchildren),
194	'fileschanged': pronk(fileschanged),	216	'fileschanged': pronk(fileschanged),

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages