shrink-revlog.py
294 lines
| 9.5 KiB
| text/x-python
|
PythonLexer
/ contrib / shrink-revlog.py
Augie Fackler
|
r14028 | """reorder a revlog (the manifest by default) to save space | ||
Greg Ward
|
r10236 | |||
Specifically, this topologically sorts the revisions in the revlog so that | ||||
revisions on the same branch are adjacent as much as possible. This is a | ||||
workaround for the fact that Mercurial computes deltas relative to the | ||||
Dirkjan Ochtman
|
r10216 | previous revision rather than relative to a parent revision. | ||
This is *not* safe to run on a changelog. | ||||
Greg Ward
|
r9515 | """ | ||
# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org> | ||||
Dirkjan Ochtman
|
r10216 | # as a patch to rewrite-log. Cleaned up, refactored, documented, and | ||
Greg Ward
|
r9515 | # renamed by Greg Ward <greg at gerg.ca>. | ||
# XXX would be nice to have a way to verify the repository after shrinking, | ||||
# e.g. by comparing "before" and "after" states of random changesets | ||||
# (maybe: export before, shrink, export after, diff). | ||||
Greg Ward
|
r16305 | import os, errno | ||
Augie Fackler
|
r14029 | from mercurial import revlog, transaction, node, util, scmutil | ||
Benoit Boissinot
|
r10009 | from mercurial import changegroup | ||
Benoit Boissinot
|
r10508 | from mercurial.i18n import _ | ||
Greg Ward
|
r9515 | |||
Benoit Boissinot
|
r10627 | def postorder(start, edges): | ||
result = [] | ||||
visit = list(start) | ||||
finished = set() | ||||
Greg Ward
|
r9515 | |||
Benoit Boissinot
|
r10627 | while visit: | ||
cur = visit[-1] | ||||
for p in edges[cur]: | ||||
Augie Fackler
|
r14034 | # defend against node.nullrev because it's occasionally | ||
# possible for a node to have parents (null, something) | ||||
# rather than (something, null) | ||||
if p not in finished and p != node.nullrev: | ||||
Benoit Boissinot
|
r10627 | visit.append(p) | ||
break | ||||
else: | ||||
result.append(cur) | ||||
finished.add(cur) | ||||
visit.pop() | ||||
return result | ||||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10623 | def toposort_reversepostorder(ui, rl): | ||
# postorder of the reverse directed graph | ||||
# map rev to list of parent revs (p2 first) | ||||
parents = {} | ||||
heads = set() | ||||
Benoit Boissinot
|
r10508 | ui.status(_('reading revs\n')) | ||
Greg Ward
|
r9515 | try: | ||
Greg Ward
|
r10623 | for rev in rl: | ||
ui.progress(_('reading'), rev, total=len(rl)) | ||||
(p1, p2) = rl.parentrevs(rev) | ||||
if p1 == p2 == node.nullrev: | ||||
parents[rev] = () # root node | ||||
elif p1 == p2 or p2 == node.nullrev: | ||||
parents[rev] = (p1,) # normal node | ||||
else: | ||||
parents[rev] = (p2, p1) # merge node | ||||
heads.add(rev) | ||||
for p in parents[rev]: | ||||
heads.discard(p) | ||||
finally: | ||||
Matt Mackall
|
r10724 | ui.progress(_('reading'), None) | ||
Greg Ward
|
r9515 | |||
Benoit Boissinot
|
r10627 | heads = list(heads) | ||
heads.sort(reverse=True) | ||||
Greg Ward
|
r10623 | |||
Benoit Boissinot
|
r10627 | ui.status(_('sorting revs\n')) | ||
return postorder(heads, parents) | ||||
Greg Ward
|
r10623 | |||
def toposort_postorderreverse(ui, rl): | ||||
# reverse-postorder of the reverse directed graph | ||||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10623 | children = {} | ||
roots = set() | ||||
ui.status(_('reading revs\n')) | ||||
try: | ||||
for rev in rl: | ||||
ui.progress(_('reading'), rev, total=len(rl)) | ||||
(p1, p2) = rl.parentrevs(rev) | ||||
if p1 == p2 == node.nullrev: | ||||
roots.add(rev) | ||||
children[rev] = [] | ||||
if p1 != node.nullrev: | ||||
children[p1].append(rev) | ||||
if p2 != node.nullrev: | ||||
children[p2].append(rev) | ||||
finally: | ||||
Matt Mackall
|
r10724 | ui.progress(_('reading'), None) | ||
Greg Ward
|
r10623 | |||
Pradeepkumar Gayam
|
r11298 | roots = list(roots) | ||
Benoit Boissinot
|
r10627 | roots.sort() | ||
Greg Ward
|
r10624 | |||
Benoit Boissinot
|
r10508 | ui.status(_('sorting revs\n')) | ||
Benoit Boissinot
|
r10627 | result = postorder(roots, children) | ||
Greg Ward
|
r10623 | result.reverse() | ||
return result | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10213 | def writerevs(ui, r1, r2, order, tr): | ||
Benoit Boissinot
|
r10009 | |||
Benoit Boissinot
|
r10508 | ui.status(_('writing revs\n')) | ||
Benoit Boissinot
|
r10440 | |||
Benoit Boissinot
|
r10009 | |||
order = [r1.node(r) for r in order] | ||||
# this is a bit ugly, but it works | ||||
Matt Mackall
|
r13783 | count = [0] | ||
Augie Fackler
|
r14030 | def lookup(revl, x): | ||
Matt Mackall
|
r13783 | count[0] += 1 | ||
ui.progress(_('writing'), count[0], total=len(order)) | ||||
Augie Fackler
|
r14030 | return "%020d" % revl.linkrev(revl.rev(x)) | ||
Matt Mackall
|
r13782 | |||
Benoit Boissinot
|
r10009 | unlookup = lambda x: int(x, 10) | ||
Greg Ward
|
r9515 | try: | ||
Augie Fackler
|
r14030 | bundler = changegroup.bundle10(lookup) | ||
group = util.chunkbuffer(r1.group(order, bundler)) | ||||
Matt Mackall
|
r12348 | group = changegroup.unbundle10(group, "UN") | ||
Matt Mackall
|
r12335 | r2.addgroup(group, unlookup, tr) | ||
Greg Ward
|
r9515 | finally: | ||
Matt Mackall
|
r10724 | ui.progress(_('writing'), None) | ||
Greg Ward
|
r9515 | |||
Benoit Boissinot
|
r10542 | def report(ui, r1, r2): | ||
def getsize(r): | ||||
s = 0 | ||||
for fn in (r.indexfile, r.datafile): | ||||
try: | ||||
s += os.stat(fn).st_size | ||||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
return s | ||||
oldsize = float(getsize(r1)) | ||||
newsize = float(getsize(r2)) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r9712 | # argh: have to pass an int to %d, because a float >= 2^32 | ||
Greg Ward
|
r9515 | # blows up under Python 2.5 or earlier | ||
Benoit Boissinot
|
r10508 | ui.write(_('old file size: %12d bytes (%6.1f MiB)\n') | ||
Matt Mackall
|
r10282 | % (int(oldsize), oldsize / 1024 / 1024)) | ||
Benoit Boissinot
|
r10508 | ui.write(_('new file size: %12d bytes (%6.1f MiB)\n') | ||
Matt Mackall
|
r10282 | % (int(newsize), newsize / 1024 / 1024)) | ||
Greg Ward
|
r9515 | |||
shrink_percent = (oldsize - newsize) / oldsize * 100 | ||||
shrink_factor = oldsize / newsize | ||||
Benoit Boissinot
|
r10508 | ui.write(_('shrinkage: %.1f%% (%.1fx)\n') | ||
% (shrink_percent, shrink_factor)) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | def shrink(ui, repo, **opts): | ||
Greg Ward
|
r10622 | """shrink a revlog by reordering revisions | ||
Rewrites all the entries in some revlog of the current repository | ||||
(by default, the manifest log) to save space. | ||||
Different sort algorithms have different performance | ||||
characteristics. Use ``--sort`` to select a sort algorithm so you | ||||
Benoit Boissinot
|
r10625 | can determine which works best for your data. | ||
Dirkjan Ochtman
|
r10215 | """ | ||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | if not repo.local(): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('not a local repository: %s') % repo.root) | ||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | fn = opts.get('revlog') | ||
if not fn: | ||||
Greg Ward
|
r9515 | indexfn = repo.sjoin('00manifest.i') | ||
else: | ||||
Dirkjan Ochtman
|
r10215 | if not fn.endswith('.i'): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('--revlog option must specify the revlog index ' | ||
'file (*.i), not %s') % opts.get('revlog')) | ||||
Greg Ward
|
r9515 | |||
Matt Mackall
|
r15381 | indexfn = os.path.realpath(fn) | ||
Greg Ward
|
r9515 | store = repo.sjoin('') | ||
if not indexfn.startswith(store): | ||||
Benoit Boissinot
|
r10508 | raise util.Abort(_('--revlog option must specify a revlog in %s, ' | ||
'not %s') % (store, indexfn)) | ||||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10622 | sortname = opts['sort'] | ||
try: | ||||
toposort = globals()['toposort_' + sortname] | ||||
except KeyError: | ||||
raise util.Abort(_('no such toposort algorithm: %s') % sortname) | ||||
Greg Ward
|
r9515 | if not os.path.exists(indexfn): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('no such file: %s') % indexfn) | ||
Greg Ward
|
r9515 | if '00changelog' in indexfn: | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('shrinking the changelog ' | ||
'will corrupt your repository')) | ||||
Benoit Boissinot
|
r10542 | |||
ui.write(_('shrinking %s\n') % indexfn) | ||||
Greg Ward
|
r11294 | tmpindexfn = util.mktempcopy(indexfn, emptyok=True) | ||
Benoit Boissinot
|
r10542 | |||
Augie Fackler
|
r14029 | r1 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), indexfn) | ||
r2 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), tmpindexfn) | ||||
Benoit Boissinot
|
r10542 | |||
datafn, tmpdatafn = r1.datafile, r2.datafile | ||||
Greg Ward
|
r9515 | |||
oldindexfn = indexfn + '.old' | ||||
olddatafn = datafn + '.old' | ||||
if os.path.exists(oldindexfn) or os.path.exists(olddatafn): | ||||
Benoit Boissinot
|
r10508 | raise util.Abort(_('one or both of\n' | ||
' %s\n' | ||||
' %s\n' | ||||
'exists from a previous run; please clean up ' | ||||
'before running again') % (oldindexfn, olddatafn)) | ||||
Greg Ward
|
r9515 | |||
# Don't use repo.transaction(), because then things get hairy with | ||||
# paths: some need to be relative to .hg, and some need to be | ||||
Dirkjan Ochtman
|
r10216 | # absolute. Doing it this way keeps things simple: everything is an | ||
Greg Ward
|
r9515 | # absolute path. | ||
lock = repo.lock(wait=False) | ||||
Patrick Mezard
|
r10234 | tr = transaction.transaction(ui.warn, | ||
Greg Ward
|
r9515 | open, | ||
repo.sjoin('journal')) | ||||
Benoit Boissinot
|
r10542 | def ignoremissing(func): | ||
def f(*args, **kw): | ||||
try: | ||||
return func(*args, **kw) | ||||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
return f | ||||
Greg Ward
|
r9515 | try: | ||
try: | ||||
Dirkjan Ochtman
|
r10213 | order = toposort(ui, r1) | ||
Benoit Boissinot
|
r10626 | |||
suboptimal = 0 | ||||
for i in xrange(1, len(order)): | ||||
parents = [p for p in r1.parentrevs(order[i]) | ||||
if p != node.nullrev] | ||||
Martin Geisler
|
r10655 | if parents and order[i - 1] not in parents: | ||
Benoit Boissinot
|
r10626 | suboptimal += 1 | ||
ui.note(_('%d suboptimal nodes\n') % suboptimal) | ||||
Dirkjan Ochtman
|
r10213 | writerevs(ui, r1, r2, order, tr) | ||
Benoit Boissinot
|
r10542 | report(ui, r1, r2) | ||
Greg Ward
|
r9515 | tr.close() | ||
Brodie Rao
|
r16705 | except: # re-raises | ||
Greg Ward
|
r9515 | # Abort transaction first, so we truncate the files before | ||
# deleting them. | ||||
tr.abort() | ||||
Benoit Boissinot
|
r10542 | for fn in (tmpindexfn, tmpdatafn): | ||
ignoremissing(os.unlink)(fn) | ||||
Greg Ward
|
r9515 | raise | ||
Patrick Mezard
|
r10241 | if not opts.get('dry_run'): | ||
Benoit Boissinot
|
r10542 | # racy, both files cannot be renamed atomically | ||
# copy files | ||||
Adrian Buehlmann
|
r14235 | util.oslink(indexfn, oldindexfn) | ||
ignoremissing(util.oslink)(datafn, olddatafn) | ||||
Greg Ward
|
r11267 | |||
Benoit Boissinot
|
r10542 | # rename | ||
Patrick Mezard
|
r10241 | util.rename(tmpindexfn, indexfn) | ||
Benoit Boissinot
|
r10542 | try: | ||
Greg Ward
|
r11267 | os.chmod(tmpdatafn, os.stat(datafn).st_mode) | ||
Benoit Boissinot
|
r10542 | util.rename(tmpdatafn, datafn) | ||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
ignoremissing(os.unlink)(datafn) | ||||
Patrick Mezard
|
r10241 | else: | ||
Benoit Boissinot
|
r10542 | for fn in (tmpindexfn, tmpdatafn): | ||
ignoremissing(os.unlink)(fn) | ||||
Greg Ward
|
r9515 | finally: | ||
lock.release() | ||||
Patrick Mezard
|
r10241 | if not opts.get('dry_run'): | ||
Greg Ward
|
r16306 | ui.write( | ||
_('note: old revlog saved in:\n' | ||||
' %s\n' | ||||
' %s\n' | ||||
'(You can delete those files when you are satisfied that your\n' | ||||
'repository is still sane. ' | ||||
'Running \'hg verify\' is strongly recommended.)\n') | ||||
% (oldindexfn, olddatafn)) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | cmdtable = { | ||
'shrink': (shrink, | ||||
Greg Ward
|
r16306 | [('', 'revlog', '', | ||
_('the revlog to shrink (.i)')), | ||||
('n', 'dry-run', None, | ||||
_('do not shrink, simulate only')), | ||||
('', 'sort', 'reversepostorder', | ||||
_('name of sort algorithm to use')), | ||||
Patrick Mezard
|
r10241 | ], | ||
Benoit Boissinot
|
r10508 | _('hg shrink [--revlog PATH]')) | ||
Dirkjan Ochtman
|
r10215 | } | ||
Greg Ward
|
r10236 | |||
if __name__ == "__main__": | ||||
Matt Mackall
|
r10282 | print "shrink-revlog.py is now an extension (see hg help extensions)" | ||