shrink-revlog.py
218 lines
| 7.6 KiB
| text/x-python
|
PythonLexer
/ contrib / shrink-revlog.py
Greg Ward
|
r9515 | #!/usr/bin/env python | ||
"""\ | ||||
Reorder a revlog (by default the the manifest file in the current | ||||
repository) to save space. Specifically, this topologically sorts the | ||||
revisions in the revlog so that revisions on the same branch are adjacent | ||||
as much as possible. This is a workaround for the fact that Mercurial | ||||
computes deltas relative to the previous revision rather than relative to a | ||||
parent revision. This is *not* safe to run on a changelog. | ||||
""" | ||||
# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org> | ||||
# as a patch to rewrite-log. Cleaned up, refactored, documented, and | ||||
# renamed by Greg Ward <greg at gerg.ca>. | ||||
# XXX would be nice to have a way to verify the repository after shrinking, | ||||
# e.g. by comparing "before" and "after" states of random changesets | ||||
# (maybe: export before, shrink, export after, diff). | ||||
import sys, os, tempfile | ||||
import optparse | ||||
from mercurial import ui as ui_, hg, revlog, transaction, node, util | ||||
def toposort(rl): | ||||
write = sys.stdout.write | ||||
children = {} | ||||
root = [] | ||||
# build children and roots | ||||
write('reading %d revs ' % len(rl)) | ||||
try: | ||||
for i in rl: | ||||
children[i] = [] | ||||
parents = [p for p in rl.parentrevs(i) if p != node.nullrev] | ||||
# in case of duplicate parents | ||||
if len(parents) == 2 and parents[0] == parents[1]: | ||||
del parents[1] | ||||
for p in parents: | ||||
assert p in children | ||||
children[p].append(i) | ||||
if len(parents) == 0: | ||||
root.append(i) | ||||
if i % 1000 == 0: | ||||
write('.') | ||||
finally: | ||||
write('\n') | ||||
# XXX this is a reimplementation of the 'branchsort' topo sort | ||||
# algorithm in hgext.convert.convcmd... would be nice not to duplicate | ||||
# the algorithm | ||||
write('sorting ...') | ||||
visit = root | ||||
ret = [] | ||||
while visit: | ||||
i = visit.pop(0) | ||||
ret.append(i) | ||||
if i not in children: | ||||
# This only happens if some node's p1 == p2, which can | ||||
# happen in the manifest in certain circumstances. | ||||
continue | ||||
next = [] | ||||
for c in children.pop(i): | ||||
parents_unseen = [p for p in rl.parentrevs(c) | ||||
if p != node.nullrev and p in children] | ||||
if len(parents_unseen) == 0: | ||||
next.append(c) | ||||
visit = next + visit | ||||
write('\n') | ||||
return ret | ||||
def writerevs(r1, r2, order, tr): | ||||
write = sys.stdout.write | ||||
write('writing %d revs ' % len(order)) | ||||
try: | ||||
count = 0 | ||||
for rev in order: | ||||
n = r1.node(rev) | ||||
p1, p2 = r1.parents(n) | ||||
l = r1.linkrev(rev) | ||||
t = r1.revision(n) | ||||
n2 = r2.addrevision(t, tr, l, p1, p2) | ||||
if count % 1000 == 0: | ||||
write('.') | ||||
count += 1 | ||||
finally: | ||||
write('\n') | ||||
def report(olddatafn, newdatafn): | ||||
oldsize = float(os.stat(olddatafn).st_size) | ||||
newsize = float(os.stat(newdatafn).st_size) | ||||
# argh: have to pass an int to %d, because a float >= 2^32 | ||||
# blows up under Python 2.5 or earlier | ||||
sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n' | ||||
% (int(oldsize), oldsize/1024/1024)) | ||||
sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n' | ||||
% (int(newsize), newsize/1024/1024)) | ||||
shrink_percent = (oldsize - newsize) / oldsize * 100 | ||||
shrink_factor = oldsize / newsize | ||||
sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n' | ||||
% (shrink_percent, shrink_factor)) | ||||
def main(): | ||||
# Unbuffer stdout for nice progress output. | ||||
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) | ||||
write = sys.stdout.write | ||||
parser = optparse.OptionParser(description=__doc__) | ||||
parser.add_option('-R', '--repository', | ||||
default=os.path.curdir, | ||||
metavar='REPO', | ||||
help='repository root directory [default: current dir]') | ||||
parser.add_option('--revlog', | ||||
metavar='FILE', | ||||
help='shrink FILE [default: REPO/hg/store/00manifest.i]') | ||||
(options, args) = parser.parse_args() | ||||
if args: | ||||
parser.error('too many arguments') | ||||
# Open the specified repository. | ||||
ui = ui_.ui() | ||||
repo = hg.repository(ui, options.repository) | ||||
if not repo.local(): | ||||
parser.error('not a local repository: %s' % options.repository) | ||||
if options.revlog is None: | ||||
indexfn = repo.sjoin('00manifest.i') | ||||
else: | ||||
if not options.revlog.endswith('.i'): | ||||
parser.error('--revlog option must specify the revlog index file ' | ||||
'(*.i), not %s' % options.revlog) | ||||
indexfn = os.path.realpath(options.revlog) | ||||
store = repo.sjoin('') | ||||
if not indexfn.startswith(store): | ||||
parser.error('--revlog option must specify a revlog in %s, not %s' | ||||
% (store, indexfn)) | ||||
datafn = indexfn[:-2] + '.d' | ||||
if not os.path.exists(indexfn): | ||||
parser.error('no such file: %s' % indexfn) | ||||
if '00changelog' in indexfn: | ||||
parser.error('shrinking the changelog will corrupt your repository') | ||||
if not os.path.exists(datafn): | ||||
# This is just a lazy shortcut because I can't be bothered to | ||||
# handle all the special cases that entail from no .d file. | ||||
parser.error('%s does not exist: revlog not big enough ' | ||||
'to be worth shrinking' % datafn) | ||||
oldindexfn = indexfn + '.old' | ||||
olddatafn = datafn + '.old' | ||||
if os.path.exists(oldindexfn) or os.path.exists(olddatafn): | ||||
parser.error('one or both of\n' | ||||
' %s\n' | ||||
' %s\n' | ||||
'exists from a previous run; please clean up before ' | ||||
'running again' | ||||
% (oldindexfn, olddatafn)) | ||||
write('shrinking %s\n' % indexfn) | ||||
prefix = os.path.basename(indexfn)[:-1] | ||||
(tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn), | ||||
prefix=prefix, | ||||
suffix='.i') | ||||
tmpdatafn = tmpindexfn[:-2] + '.d' | ||||
os.close(tmpfd) | ||||
r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn) | ||||
r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn) | ||||
# Don't use repo.transaction(), because then things get hairy with | ||||
# paths: some need to be relative to .hg, and some need to be | ||||
# absolute. Doing it this way keeps things simple: everything is an | ||||
# absolute path. | ||||
lock = repo.lock(wait=False) | ||||
tr = transaction.transaction(sys.stderr.write, | ||||
open, | ||||
repo.sjoin('journal')) | ||||
try: | ||||
try: | ||||
order = toposort(r1) | ||||
writerevs(r1, r2, order, tr) | ||||
report(datafn, tmpdatafn) | ||||
tr.close() | ||||
except: | ||||
# Abort transaction first, so we truncate the files before | ||||
# deleting them. | ||||
tr.abort() | ||||
if os.path.exists(tmpindexfn): | ||||
os.unlink(tmpindexfn) | ||||
if os.path.exists(tmpdatafn): | ||||
os.unlink(tmpdatafn) | ||||
raise | ||||
finally: | ||||
lock.release() | ||||
os.link(indexfn, oldindexfn) | ||||
os.link(datafn, olddatafn) | ||||
os.rename(tmpindexfn, indexfn) | ||||
os.rename(tmpdatafn, datafn) | ||||
write('note: old revlog saved in:\n' | ||||
' %s\n' | ||||
' %s\n' | ||||
'(You can delete those files when you are satisfied that your\n' | ||||
'repository is still sane. ' | ||||
'Running \'hg verify\' is strongly recommended.)\n' | ||||
% (oldindexfn, olddatafn)) | ||||
try: | ||||
main() | ||||
except KeyboardInterrupt: | ||||
sys.exit("interrupted") | ||||