##// END OF EJS Templates
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.

File last commit:

r10624:432eb853 default
r10624:432eb853 default
Show More
shrink-revlog.py
368 lines | 11.9 KiB | text/x-python | PythonLexer
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 #!/usr/bin/env python
"""\
Greg Ward
shrink-revlog: help/doc tweaks...
r10236 reorder a revlog (the manifest by default) to save space
Specifically, this topologically sorts the revisions in the revlog so that
revisions on the same branch are adjacent as much as possible. This is a
workaround for the fact that Mercurial computes deltas relative to the
Dirkjan Ochtman
contrib: small documentation fixes in shrink-revlog.py
r10216 previous revision rather than relative to a parent revision.
This is *not* safe to run on a changelog.
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 """
# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
Dirkjan Ochtman
contrib: small documentation fixes in shrink-revlog.py
r10216 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 # renamed by Greg Ward <greg at gerg.ca>.
# XXX would be nice to have a way to verify the repository after shrinking,
# e.g. by comparing "before" and "after" states of random changesets
# (maybe: export before, shrink, export after, diff).
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 import os, tempfile, errno
Benoit Boissinot
shrink-revlog: remove unneeded imports and useless code
r10509 from mercurial import revlog, transaction, node, util
Benoit Boissinot
shrink-revlog: improve performance: use changegroup instead of revisions...
r10009 from mercurial import changegroup
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 from mercurial.i18n import _
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Greg Ward
shrink-revlog: add --sort option for user-selectable toposort algorithm.
r10622 def toposort_branchsort(ui, rl):
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
children = {}
root = []
# build children and roots
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.status(_('reading revs\n'))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 try:
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 for rev in rl:
ui.progress(_('reading'), rev, total=len(rl))
children[rev] = []
parents = [p for p in rl.parentrevs(rev) if p != node.nullrev]
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 # in case of duplicate parents
if len(parents) == 2 and parents[0] == parents[1]:
del parents[1]
for p in parents:
assert p in children
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 children[p].append(rev)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
if len(parents) == 0:
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 root.append(rev)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 finally:
Martin Geisler
progress: mark strings for translation
r10496 ui.progress(_('reading'), None, total=len(rl))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
# XXX this is a reimplementation of the 'branchsort' topo sort
# algorithm in hgext.convert.convcmd... would be nice not to duplicate
# the algorithm
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.status(_('sorting revs\n'))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 visit = root
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 result = []
Greg Ward
shrink-revlog: instrument sort code to record statistics....
r10620
# suboptimal: nodes whose predecessor is not first parent
suboptimal = 0
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 while visit:
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 cur = visit.pop(0)
# revlog will compute delta relative to result[-1], so keep track
Greg Ward
shrink-revlog: instrument sort code to record statistics....
r10620 # of nodes where this might result in a large delta
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 parents = rl.parentrevs(cur)
if result:
if result[-1] != parents[0]:
Greg Ward
shrink-revlog: instrument sort code to record statistics....
r10620 suboptimal += 1
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 result.append(cur)
if cur not in children:
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 # This only happens if some node's p1 == p2, which can
# happen in the manifest in certain circumstances.
continue
next = []
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 for c in children.pop(cur):
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 parents_unseen = [p for p in rl.parentrevs(c)
if p != node.nullrev and p in children]
if len(parents_unseen) == 0:
next.append(c)
visit = next + visit
Greg Ward
shrink-revlog: instrument sort code to record statistics....
r10620 ui.note(_('%d suboptimal nodes\n') % suboptimal)
Greg Ward
shrink-revlog: rename some local variables for consistency.
r10621 return result
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 def toposort_reversepostorder(ui, rl):
# postorder of the reverse directed graph
# map rev to list of parent revs (p2 first)
parents = {}
heads = set()
ui.status(_('reading revs\n'))
try:
for rev in rl:
ui.progress(_('reading'), rev, total=len(rl))
(p1, p2) = rl.parentrevs(rev)
if p1 == p2 == node.nullrev:
parents[rev] = () # root node
elif p1 == p2 or p2 == node.nullrev:
parents[rev] = (p1,) # normal node
else:
parents[rev] = (p2, p1) # merge node
heads.add(rev)
for p in parents[rev]:
heads.discard(p)
finally:
ui.progress(_('reading'), None, total=len(rl))
ui.status(_('sorting revs\n'))
result = []
visit = list(heads)
visit.sort(reverse=True)
finished = set()
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624 suboptimal = 0
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623
while visit:
cur = visit[-1]
for p in parents[cur]:
if p not in finished:
visit.append(p)
break
else:
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624 curparents = rl.parentrevs(cur)
if result and result[-1] != curparents[0]:
suboptimal += 1
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 result.append(cur)
finished.add(cur)
visit.pop()
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624
ui.note(_('%d suboptimal nodes\n') % suboptimal)
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 return result
def toposort_postorderreverse(ui, rl):
# reverse-postorder of the reverse directed graph
children = {}
roots = set()
ui.status(_('reading revs\n'))
try:
for rev in rl:
ui.progress(_('reading'), rev, total=len(rl))
(p1, p2) = rl.parentrevs(rev)
if p1 == p2 == node.nullrev:
roots.add(rev)
children[rev] = []
if p1 != node.nullrev:
children[p1].append(rev)
if p2 != node.nullrev:
children[p2].append(rev)
finally:
ui.progress(_('reading'), None, total=len(rl))
ui.status(_('sorting revs\n'))
result = []
visit = list(roots)
visit.sort()
finished = set()
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624 suboptimal = 0
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 while visit:
cur = visit[-1]
for p in children[cur]:
if p not in finished:
visit.append(p)
break
else:
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624 # if cur is not the first parent of its successor, then the
# successor is a suboptimal node
if result:
succparents = rl.parentrevs(result[-1])
if succparents[0] != cur:
suboptimal += 1
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 result.append(cur)
finished.add(cur)
visit.pop()
Greg Ward
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
r10624
ui.note(_('%d suboptimal nodes\n') % suboptimal)
Greg Ward
shrink-revlog: add "reverse postorder" and "postorder reverse" toposorts....
r10623 result.reverse()
return result
Dirkjan Ochtman
contrib: use ui to write in shrink-revlog.py
r10213 def writerevs(ui, r1, r2, order, tr):
Benoit Boissinot
shrink-revlog: improve performance: use changegroup instead of revisions...
r10009
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.status(_('writing revs\n'))
Benoit Boissinot
shrink: use progress API
r10440
Benoit Boissinot
shrink-revlog: improve performance: use changegroup instead of revisions...
r10009 count = [0]
def progress(*args):
Martin Geisler
progress: mark strings for translation
r10496 ui.progress(_('writing'), count[0], total=len(order))
Benoit Boissinot
shrink-revlog: improve performance: use changegroup instead of revisions...
r10009 count[0] += 1
order = [r1.node(r) for r in order]
# this is a bit ugly, but it works
lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
unlookup = lambda x: int(x, 10)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 try:
Benoit Boissinot
shrink-revlog: improve performance: use changegroup instead of revisions...
r10009 group = util.chunkbuffer(r1.group(order, lookup, progress))
chunkiter = changegroup.chunkiter(group)
r2.addgroup(chunkiter, unlookup, tr)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 finally:
Martin Geisler
progress: mark strings for translation
r10496 ui.progress(_('writing'), None, len(order))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 def report(ui, r1, r2):
def getsize(r):
s = 0
for fn in (r.indexfile, r.datafile):
try:
s += os.stat(fn).st_size
except OSError, inst:
if inst.errno != errno.ENOENT:
raise
return s
oldsize = float(getsize(r1))
newsize = float(getsize(r2))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
kill trailing whitespace
r9712 # argh: have to pass an int to %d, because a float >= 2^32
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 # blows up under Python 2.5 or earlier
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
Matt Mackall
many, many trivial check-code fixups
r10282 % (int(oldsize), oldsize / 1024 / 1024))
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
Matt Mackall
many, many trivial check-code fixups
r10282 % (int(newsize), newsize / 1024 / 1024))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
shrink_percent = (oldsize - newsize) / oldsize * 100
shrink_factor = oldsize / newsize
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
% (shrink_percent, shrink_factor))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 def shrink(ui, repo, **opts):
Greg Ward
shrink-revlog: add --sort option for user-selectable toposort algorithm.
r10622 """shrink a revlog by reordering revisions
Rewrites all the entries in some revlog of the current repository
(by default, the manifest log) to save space.
Different sort algorithms have different performance
characteristics. Use ``--sort`` to select a sort algorithm so you
can determine which works best for your data. The default
algorithm, ``branchsort``, works well for workflows with lots of
active (unmerged) branches, but not so well when all branches have
been merged and there is only one repository head.
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 """
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 if not repo.local():
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('not a local repository: %s') % repo.root)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 fn = opts.get('revlog')
if not fn:
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 indexfn = repo.sjoin('00manifest.i')
else:
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 if not fn.endswith('.i'):
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('--revlog option must specify the revlog index '
'file (*.i), not %s') % opts.get('revlog'))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 indexfn = os.path.realpath(fn)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 store = repo.sjoin('')
if not indexfn.startswith(store):
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('--revlog option must specify a revlog in %s, '
'not %s') % (store, indexfn))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Greg Ward
shrink-revlog: add --sort option for user-selectable toposort algorithm.
r10622 sortname = opts['sort']
try:
toposort = globals()['toposort_' + sortname]
except KeyError:
raise util.Abort(_('no such toposort algorithm: %s') % sortname)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 if not os.path.exists(indexfn):
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('no such file: %s') % indexfn)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 if '00changelog' in indexfn:
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('shrinking the changelog '
'will corrupt your repository'))
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542
ui.write(_('shrinking %s\n') % indexfn)
prefix = os.path.basename(indexfn)[:-1]
(tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
prefix=prefix,
suffix='.i')
os.close(tmpfd)
r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
datafn, tmpdatafn = r1.datafile, r2.datafile
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
oldindexfn = indexfn + '.old'
olddatafn = datafn + '.old'
if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 raise util.Abort(_('one or both of\n'
' %s\n'
' %s\n'
'exists from a previous run; please clean up '
'before running again') % (oldindexfn, olddatafn))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
# Don't use repo.transaction(), because then things get hairy with
# paths: some need to be relative to .hg, and some need to be
Dirkjan Ochtman
contrib: small documentation fixes in shrink-revlog.py
r10216 # absolute. Doing it this way keeps things simple: everything is an
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 # absolute path.
lock = repo.lock(wait=False)
Patrick Mezard
shrink-revlog: make it work on windows (issue1976)
r10234 tr = transaction.transaction(ui.warn,
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 open,
repo.sjoin('journal'))
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 def ignoremissing(func):
def f(*args, **kw):
try:
return func(*args, **kw)
except OSError, inst:
if inst.errno != errno.ENOENT:
raise
return f
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 try:
try:
Dirkjan Ochtman
contrib: use ui to write in shrink-revlog.py
r10213 order = toposort(ui, r1)
writerevs(ui, r1, r2, order, tr)
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 report(ui, r1, r2)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 tr.close()
except:
# Abort transaction first, so we truncate the files before
# deleting them.
tr.abort()
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 for fn in (tmpindexfn, tmpdatafn):
ignoremissing(os.unlink)(fn)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 raise
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 if not opts.get('dry_run'):
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 # racy, both files cannot be renamed atomically
# copy files
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 util.os_link(indexfn, oldindexfn)
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 ignoremissing(util.os_link)(datafn, olddatafn)
# rename
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 util.rename(tmpindexfn, indexfn)
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 try:
util.rename(tmpdatafn, datafn)
except OSError, inst:
if inst.errno != errno.ENOENT:
raise
ignoremissing(os.unlink)(datafn)
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 else:
Benoit Boissinot
shrink: handle all combinations of inline/non-inline revlogs
r10542 for fn in (tmpindexfn, tmpdatafn):
ignoremissing(os.unlink)(fn)
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515 finally:
lock.release()
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 if not opts.get('dry_run'):
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 ui.write(_('note: old revlog saved in:\n'
' %s\n'
' %s\n'
'(You can delete those files when you are satisfied that your\n'
'repository is still sane. '
'Running \'hg verify\' is strongly recommended.)\n')
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 % (oldindexfn, olddatafn))
Greg Ward
Add script to rewrite revlog to workaround lack of parent deltas....
r9515
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 cmdtable = {
'shrink': (shrink,
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
('n', 'dry-run', None, _('do not shrink, simulate only')),
Greg Ward
shrink-revlog: add --sort option for user-selectable toposort algorithm.
r10622 ('', 'sort', 'branchsort', 'name of sort algorithm to use'),
Patrick Mezard
shrink-revlog: add --dry-run option
r10241 ],
Benoit Boissinot
shrink-revlog: add strings for translation / import _ before using it
r10508 _('hg shrink [--revlog PATH]'))
Dirkjan Ochtman
contrib: turn shrink-revlog.py into an extension
r10215 }
Greg Ward
shrink-revlog: help/doc tweaks...
r10236
if __name__ == "__main__":
Matt Mackall
many, many trivial check-code fixups
r10282 print "shrink-revlog.py is now an extension (see hg help extensions)"