shrink-revlog.py
368 lines
| 11.9 KiB
| text/x-python
|
PythonLexer
/ contrib / shrink-revlog.py
Greg Ward
|
r9515 | #!/usr/bin/env python | ||
"""\ | ||||
Greg Ward
|
r10236 | reorder a revlog (the manifest by default) to save space | ||
Specifically, this topologically sorts the revisions in the revlog so that | ||||
revisions on the same branch are adjacent as much as possible. This is a | ||||
workaround for the fact that Mercurial computes deltas relative to the | ||||
Dirkjan Ochtman
|
r10216 | previous revision rather than relative to a parent revision. | ||
This is *not* safe to run on a changelog. | ||||
Greg Ward
|
r9515 | """ | ||
# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org> | ||||
Dirkjan Ochtman
|
r10216 | # as a patch to rewrite-log. Cleaned up, refactored, documented, and | ||
Greg Ward
|
r9515 | # renamed by Greg Ward <greg at gerg.ca>. | ||
# XXX would be nice to have a way to verify the repository after shrinking, | ||||
# e.g. by comparing "before" and "after" states of random changesets | ||||
# (maybe: export before, shrink, export after, diff). | ||||
Benoit Boissinot
|
r10542 | import os, tempfile, errno | ||
Benoit Boissinot
|
r10509 | from mercurial import revlog, transaction, node, util | ||
Benoit Boissinot
|
r10009 | from mercurial import changegroup | ||
Benoit Boissinot
|
r10508 | from mercurial.i18n import _ | ||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10622 | def toposort_branchsort(ui, rl): | ||
Greg Ward
|
r9515 | |||
children = {} | ||||
root = [] | ||||
# build children and roots | ||||
Benoit Boissinot
|
r10508 | ui.status(_('reading revs\n')) | ||
Greg Ward
|
r9515 | try: | ||
Greg Ward
|
r10621 | for rev in rl: | ||
ui.progress(_('reading'), rev, total=len(rl)) | ||||
children[rev] = [] | ||||
parents = [p for p in rl.parentrevs(rev) if p != node.nullrev] | ||||
Greg Ward
|
r9515 | # in case of duplicate parents | ||
if len(parents) == 2 and parents[0] == parents[1]: | ||||
del parents[1] | ||||
for p in parents: | ||||
assert p in children | ||||
Greg Ward
|
r10621 | children[p].append(rev) | ||
Greg Ward
|
r9515 | |||
if len(parents) == 0: | ||||
Greg Ward
|
r10621 | root.append(rev) | ||
Greg Ward
|
r9515 | finally: | ||
Martin Geisler
|
r10496 | ui.progress(_('reading'), None, total=len(rl)) | ||
Greg Ward
|
r9515 | |||
# XXX this is a reimplementation of the 'branchsort' topo sort | ||||
# algorithm in hgext.convert.convcmd... would be nice not to duplicate | ||||
# the algorithm | ||||
Benoit Boissinot
|
r10508 | ui.status(_('sorting revs\n')) | ||
Greg Ward
|
r9515 | visit = root | ||
Greg Ward
|
r10621 | result = [] | ||
Greg Ward
|
r10620 | |||
# suboptimal: nodes whose predecessor is not first parent | ||||
suboptimal = 0 | ||||
Greg Ward
|
r9515 | while visit: | ||
Greg Ward
|
r10621 | cur = visit.pop(0) | ||
# revlog will compute delta relative to result[-1], so keep track | ||||
Greg Ward
|
r10620 | # of nodes where this might result in a large delta | ||
Greg Ward
|
r10621 | parents = rl.parentrevs(cur) | ||
if result: | ||||
if result[-1] != parents[0]: | ||||
Greg Ward
|
r10620 | suboptimal += 1 | ||
Greg Ward
|
r10621 | result.append(cur) | ||
if cur not in children: | ||||
Greg Ward
|
r9515 | # This only happens if some node's p1 == p2, which can | ||
# happen in the manifest in certain circumstances. | ||||
continue | ||||
next = [] | ||||
Greg Ward
|
r10621 | for c in children.pop(cur): | ||
Greg Ward
|
r9515 | parents_unseen = [p for p in rl.parentrevs(c) | ||
if p != node.nullrev and p in children] | ||||
if len(parents_unseen) == 0: | ||||
next.append(c) | ||||
visit = next + visit | ||||
Greg Ward
|
r10620 | ui.note(_('%d suboptimal nodes\n') % suboptimal) | ||
Greg Ward
|
r10621 | return result | ||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10623 | def toposort_reversepostorder(ui, rl): | ||
# postorder of the reverse directed graph | ||||
# map rev to list of parent revs (p2 first) | ||||
parents = {} | ||||
heads = set() | ||||
ui.status(_('reading revs\n')) | ||||
try: | ||||
for rev in rl: | ||||
ui.progress(_('reading'), rev, total=len(rl)) | ||||
(p1, p2) = rl.parentrevs(rev) | ||||
if p1 == p2 == node.nullrev: | ||||
parents[rev] = () # root node | ||||
elif p1 == p2 or p2 == node.nullrev: | ||||
parents[rev] = (p1,) # normal node | ||||
else: | ||||
parents[rev] = (p2, p1) # merge node | ||||
heads.add(rev) | ||||
for p in parents[rev]: | ||||
heads.discard(p) | ||||
finally: | ||||
ui.progress(_('reading'), None, total=len(rl)) | ||||
ui.status(_('sorting revs\n')) | ||||
result = [] | ||||
visit = list(heads) | ||||
visit.sort(reverse=True) | ||||
finished = set() | ||||
Greg Ward
|
r10624 | suboptimal = 0 | ||
Greg Ward
|
r10623 | |||
while visit: | ||||
cur = visit[-1] | ||||
for p in parents[cur]: | ||||
if p not in finished: | ||||
visit.append(p) | ||||
break | ||||
else: | ||||
Greg Ward
|
r10624 | curparents = rl.parentrevs(cur) | ||
if result and result[-1] != curparents[0]: | ||||
suboptimal += 1 | ||||
Greg Ward
|
r10623 | result.append(cur) | ||
finished.add(cur) | ||||
visit.pop() | ||||
Greg Ward
|
r10624 | |||
ui.note(_('%d suboptimal nodes\n') % suboptimal) | ||||
Greg Ward
|
r10623 | return result | ||
def toposort_postorderreverse(ui, rl): | ||||
# reverse-postorder of the reverse directed graph | ||||
children = {} | ||||
roots = set() | ||||
ui.status(_('reading revs\n')) | ||||
try: | ||||
for rev in rl: | ||||
ui.progress(_('reading'), rev, total=len(rl)) | ||||
(p1, p2) = rl.parentrevs(rev) | ||||
if p1 == p2 == node.nullrev: | ||||
roots.add(rev) | ||||
children[rev] = [] | ||||
if p1 != node.nullrev: | ||||
children[p1].append(rev) | ||||
if p2 != node.nullrev: | ||||
children[p2].append(rev) | ||||
finally: | ||||
ui.progress(_('reading'), None, total=len(rl)) | ||||
ui.status(_('sorting revs\n')) | ||||
result = [] | ||||
visit = list(roots) | ||||
visit.sort() | ||||
finished = set() | ||||
Greg Ward
|
r10624 | suboptimal = 0 | ||
Greg Ward
|
r10623 | while visit: | ||
cur = visit[-1] | ||||
for p in children[cur]: | ||||
if p not in finished: | ||||
visit.append(p) | ||||
break | ||||
else: | ||||
Greg Ward
|
r10624 | # if cur is not the first parent of its successor, then the | ||
# successor is a suboptimal node | ||||
if result: | ||||
succparents = rl.parentrevs(result[-1]) | ||||
if succparents[0] != cur: | ||||
suboptimal += 1 | ||||
Greg Ward
|
r10623 | result.append(cur) | ||
finished.add(cur) | ||||
visit.pop() | ||||
Greg Ward
|
r10624 | |||
ui.note(_('%d suboptimal nodes\n') % suboptimal) | ||||
Greg Ward
|
r10623 | result.reverse() | ||
return result | ||||
Dirkjan Ochtman
|
r10213 | def writerevs(ui, r1, r2, order, tr): | ||
Benoit Boissinot
|
r10009 | |||
Benoit Boissinot
|
r10508 | ui.status(_('writing revs\n')) | ||
Benoit Boissinot
|
r10440 | |||
Benoit Boissinot
|
r10009 | count = [0] | ||
def progress(*args): | ||||
Martin Geisler
|
r10496 | ui.progress(_('writing'), count[0], total=len(order)) | ||
Benoit Boissinot
|
r10009 | count[0] += 1 | ||
order = [r1.node(r) for r in order] | ||||
# this is a bit ugly, but it works | ||||
lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x)) | ||||
unlookup = lambda x: int(x, 10) | ||||
Greg Ward
|
r9515 | try: | ||
Benoit Boissinot
|
r10009 | group = util.chunkbuffer(r1.group(order, lookup, progress)) | ||
chunkiter = changegroup.chunkiter(group) | ||||
r2.addgroup(chunkiter, unlookup, tr) | ||||
Greg Ward
|
r9515 | finally: | ||
Martin Geisler
|
r10496 | ui.progress(_('writing'), None, len(order)) | ||
Greg Ward
|
r9515 | |||
Benoit Boissinot
|
r10542 | def report(ui, r1, r2): | ||
def getsize(r): | ||||
s = 0 | ||||
for fn in (r.indexfile, r.datafile): | ||||
try: | ||||
s += os.stat(fn).st_size | ||||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
return s | ||||
oldsize = float(getsize(r1)) | ||||
newsize = float(getsize(r2)) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r9712 | # argh: have to pass an int to %d, because a float >= 2^32 | ||
Greg Ward
|
r9515 | # blows up under Python 2.5 or earlier | ||
Benoit Boissinot
|
r10508 | ui.write(_('old file size: %12d bytes (%6.1f MiB)\n') | ||
Matt Mackall
|
r10282 | % (int(oldsize), oldsize / 1024 / 1024)) | ||
Benoit Boissinot
|
r10508 | ui.write(_('new file size: %12d bytes (%6.1f MiB)\n') | ||
Matt Mackall
|
r10282 | % (int(newsize), newsize / 1024 / 1024)) | ||
Greg Ward
|
r9515 | |||
shrink_percent = (oldsize - newsize) / oldsize * 100 | ||||
shrink_factor = oldsize / newsize | ||||
Benoit Boissinot
|
r10508 | ui.write(_('shrinkage: %.1f%% (%.1fx)\n') | ||
% (shrink_percent, shrink_factor)) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | def shrink(ui, repo, **opts): | ||
Greg Ward
|
r10622 | """shrink a revlog by reordering revisions | ||
Rewrites all the entries in some revlog of the current repository | ||||
(by default, the manifest log) to save space. | ||||
Different sort algorithms have different performance | ||||
characteristics. Use ``--sort`` to select a sort algorithm so you | ||||
can determine which works best for your data. The default | ||||
algorithm, ``branchsort``, works well for workflows with lots of | ||||
active (unmerged) branches, but not so well when all branches have | ||||
been merged and there is only one repository head. | ||||
Dirkjan Ochtman
|
r10215 | """ | ||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | if not repo.local(): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('not a local repository: %s') % repo.root) | ||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | fn = opts.get('revlog') | ||
if not fn: | ||||
Greg Ward
|
r9515 | indexfn = repo.sjoin('00manifest.i') | ||
else: | ||||
Dirkjan Ochtman
|
r10215 | if not fn.endswith('.i'): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('--revlog option must specify the revlog index ' | ||
'file (*.i), not %s') % opts.get('revlog')) | ||||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | indexfn = os.path.realpath(fn) | ||
Greg Ward
|
r9515 | store = repo.sjoin('') | ||
if not indexfn.startswith(store): | ||||
Benoit Boissinot
|
r10508 | raise util.Abort(_('--revlog option must specify a revlog in %s, ' | ||
'not %s') % (store, indexfn)) | ||||
Greg Ward
|
r9515 | |||
Greg Ward
|
r10622 | sortname = opts['sort'] | ||
try: | ||||
toposort = globals()['toposort_' + sortname] | ||||
except KeyError: | ||||
raise util.Abort(_('no such toposort algorithm: %s') % sortname) | ||||
Greg Ward
|
r9515 | if not os.path.exists(indexfn): | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('no such file: %s') % indexfn) | ||
Greg Ward
|
r9515 | if '00changelog' in indexfn: | ||
Benoit Boissinot
|
r10508 | raise util.Abort(_('shrinking the changelog ' | ||
'will corrupt your repository')) | ||||
Benoit Boissinot
|
r10542 | |||
ui.write(_('shrinking %s\n') % indexfn) | ||||
prefix = os.path.basename(indexfn)[:-1] | ||||
(tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn), | ||||
prefix=prefix, | ||||
suffix='.i') | ||||
os.close(tmpfd) | ||||
r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn) | ||||
r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn) | ||||
datafn, tmpdatafn = r1.datafile, r2.datafile | ||||
Greg Ward
|
r9515 | |||
oldindexfn = indexfn + '.old' | ||||
olddatafn = datafn + '.old' | ||||
if os.path.exists(oldindexfn) or os.path.exists(olddatafn): | ||||
Benoit Boissinot
|
r10508 | raise util.Abort(_('one or both of\n' | ||
' %s\n' | ||||
' %s\n' | ||||
'exists from a previous run; please clean up ' | ||||
'before running again') % (oldindexfn, olddatafn)) | ||||
Greg Ward
|
r9515 | |||
# Don't use repo.transaction(), because then things get hairy with | ||||
# paths: some need to be relative to .hg, and some need to be | ||||
Dirkjan Ochtman
|
r10216 | # absolute. Doing it this way keeps things simple: everything is an | ||
Greg Ward
|
r9515 | # absolute path. | ||
lock = repo.lock(wait=False) | ||||
Patrick Mezard
|
r10234 | tr = transaction.transaction(ui.warn, | ||
Greg Ward
|
r9515 | open, | ||
repo.sjoin('journal')) | ||||
Benoit Boissinot
|
r10542 | def ignoremissing(func): | ||
def f(*args, **kw): | ||||
try: | ||||
return func(*args, **kw) | ||||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
return f | ||||
Greg Ward
|
r9515 | try: | ||
try: | ||||
Dirkjan Ochtman
|
r10213 | order = toposort(ui, r1) | ||
writerevs(ui, r1, r2, order, tr) | ||||
Benoit Boissinot
|
r10542 | report(ui, r1, r2) | ||
Greg Ward
|
r9515 | tr.close() | ||
except: | ||||
# Abort transaction first, so we truncate the files before | ||||
# deleting them. | ||||
tr.abort() | ||||
Benoit Boissinot
|
r10542 | for fn in (tmpindexfn, tmpdatafn): | ||
ignoremissing(os.unlink)(fn) | ||||
Greg Ward
|
r9515 | raise | ||
Patrick Mezard
|
r10241 | if not opts.get('dry_run'): | ||
Benoit Boissinot
|
r10542 | # racy, both files cannot be renamed atomically | ||
# copy files | ||||
Patrick Mezard
|
r10241 | util.os_link(indexfn, oldindexfn) | ||
Benoit Boissinot
|
r10542 | ignoremissing(util.os_link)(datafn, olddatafn) | ||
# rename | ||||
Patrick Mezard
|
r10241 | util.rename(tmpindexfn, indexfn) | ||
Benoit Boissinot
|
r10542 | try: | ||
util.rename(tmpdatafn, datafn) | ||||
except OSError, inst: | ||||
if inst.errno != errno.ENOENT: | ||||
raise | ||||
ignoremissing(os.unlink)(datafn) | ||||
Patrick Mezard
|
r10241 | else: | ||
Benoit Boissinot
|
r10542 | for fn in (tmpindexfn, tmpdatafn): | ||
ignoremissing(os.unlink)(fn) | ||||
Greg Ward
|
r9515 | finally: | ||
lock.release() | ||||
Patrick Mezard
|
r10241 | if not opts.get('dry_run'): | ||
Benoit Boissinot
|
r10508 | ui.write(_('note: old revlog saved in:\n' | ||
' %s\n' | ||||
' %s\n' | ||||
'(You can delete those files when you are satisfied that your\n' | ||||
'repository is still sane. ' | ||||
'Running \'hg verify\' is strongly recommended.)\n') | ||||
Patrick Mezard
|
r10241 | % (oldindexfn, olddatafn)) | ||
Greg Ward
|
r9515 | |||
Dirkjan Ochtman
|
r10215 | cmdtable = { | ||
'shrink': (shrink, | ||||
Benoit Boissinot
|
r10508 | [('', 'revlog', '', _('index (.i) file of the revlog to shrink')), | ||
('n', 'dry-run', None, _('do not shrink, simulate only')), | ||||
Greg Ward
|
r10622 | ('', 'sort', 'branchsort', 'name of sort algorithm to use'), | ||
Patrick Mezard
|
r10241 | ], | ||
Benoit Boissinot
|
r10508 | _('hg shrink [--revlog PATH]')) | ||
Dirkjan Ochtman
|
r10215 | } | ||
Greg Ward
|
r10236 | |||
if __name__ == "__main__": | ||||
Matt Mackall
|
r10282 | print "shrink-revlog.py is now an extension (see hg help extensions)" | ||