##// END OF EJS Templates
shrink-revlog: improve performance: use changegroup instead of revisions...
Benoit Boissinot -
r10009:69dca857 default
parent child Browse files
Show More
@@ -1,218 +1,224 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2
2
3 """\
3 """\
4 Reorder a revlog (by default the manifest file in the current
4 Reorder a revlog (by default the manifest file in the current
5 repository) to save space. Specifically, this topologically sorts the
5 repository) to save space. Specifically, this topologically sorts the
6 revisions in the revlog so that revisions on the same branch are adjacent
6 revisions in the revlog so that revisions on the same branch are adjacent
7 as much as possible. This is a workaround for the fact that Mercurial
7 as much as possible. This is a workaround for the fact that Mercurial
8 computes deltas relative to the previous revision rather than relative to a
8 computes deltas relative to the previous revision rather than relative to a
9 parent revision. This is *not* safe to run on a changelog.
9 parent revision. This is *not* safe to run on a changelog.
10 """
10 """
11
11
12 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
12 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
13 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
13 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
14 # renamed by Greg Ward <greg at gerg.ca>.
14 # renamed by Greg Ward <greg at gerg.ca>.
15
15
16 # XXX would be nice to have a way to verify the repository after shrinking,
16 # XXX would be nice to have a way to verify the repository after shrinking,
17 # e.g. by comparing "before" and "after" states of random changesets
17 # e.g. by comparing "before" and "after" states of random changesets
18 # (maybe: export before, shrink, export after, diff).
18 # (maybe: export before, shrink, export after, diff).
19
19
20 import sys, os, tempfile
20 import sys, os, tempfile
21 import optparse
21 import optparse
22 from mercurial import ui as ui_, hg, revlog, transaction, node, util
22 from mercurial import ui as ui_, hg, revlog, transaction, node, util
23 from mercurial import changegroup
23
24
24 def toposort(rl):
25 def toposort(rl):
25 write = sys.stdout.write
26 write = sys.stdout.write
26
27
27 children = {}
28 children = {}
28 root = []
29 root = []
29 # build children and roots
30 # build children and roots
30 write('reading %d revs ' % len(rl))
31 write('reading %d revs ' % len(rl))
31 try:
32 try:
32 for i in rl:
33 for i in rl:
33 children[i] = []
34 children[i] = []
34 parents = [p for p in rl.parentrevs(i) if p != node.nullrev]
35 parents = [p for p in rl.parentrevs(i) if p != node.nullrev]
35 # in case of duplicate parents
36 # in case of duplicate parents
36 if len(parents) == 2 and parents[0] == parents[1]:
37 if len(parents) == 2 and parents[0] == parents[1]:
37 del parents[1]
38 del parents[1]
38 for p in parents:
39 for p in parents:
39 assert p in children
40 assert p in children
40 children[p].append(i)
41 children[p].append(i)
41
42
42 if len(parents) == 0:
43 if len(parents) == 0:
43 root.append(i)
44 root.append(i)
44
45
45 if i % 1000 == 0:
46 if i % 1000 == 0:
46 write('.')
47 write('.')
47 finally:
48 finally:
48 write('\n')
49 write('\n')
49
50
50 # XXX this is a reimplementation of the 'branchsort' topo sort
51 # XXX this is a reimplementation of the 'branchsort' topo sort
51 # algorithm in hgext.convert.convcmd... would be nice not to duplicate
52 # algorithm in hgext.convert.convcmd... would be nice not to duplicate
52 # the algorithm
53 # the algorithm
53 write('sorting ...')
54 write('sorting ...')
54 visit = root
55 visit = root
55 ret = []
56 ret = []
56 while visit:
57 while visit:
57 i = visit.pop(0)
58 i = visit.pop(0)
58 ret.append(i)
59 ret.append(i)
59 if i not in children:
60 if i not in children:
60 # This only happens if some node's p1 == p2, which can
61 # This only happens if some node's p1 == p2, which can
61 # happen in the manifest in certain circumstances.
62 # happen in the manifest in certain circumstances.
62 continue
63 continue
63 next = []
64 next = []
64 for c in children.pop(i):
65 for c in children.pop(i):
65 parents_unseen = [p for p in rl.parentrevs(c)
66 parents_unseen = [p for p in rl.parentrevs(c)
66 if p != node.nullrev and p in children]
67 if p != node.nullrev and p in children]
67 if len(parents_unseen) == 0:
68 if len(parents_unseen) == 0:
68 next.append(c)
69 next.append(c)
69 visit = next + visit
70 visit = next + visit
70 write('\n')
71 write('\n')
71 return ret
72 return ret
72
73
73 def writerevs(r1, r2, order, tr):
74 def writerevs(r1, r2, order, tr):
74 write = sys.stdout.write
75 write = sys.stdout.write
75 write('writing %d revs ' % len(order))
76 write('writing %d revs ' % len(order))
77
78 count = [0]
79 def progress(*args):
80 if count[0] % 1000 == 0:
81 write('.')
82 count[0] += 1
83
84 order = [r1.node(r) for r in order]
85
86 # this is a bit ugly, but it works
87 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
88 unlookup = lambda x: int(x, 10)
89
76 try:
90 try:
77 count = 0
91 group = util.chunkbuffer(r1.group(order, lookup, progress))
78 for rev in order:
92 chunkiter = changegroup.chunkiter(group)
79 n = r1.node(rev)
93 r2.addgroup(chunkiter, unlookup, tr)
80 p1, p2 = r1.parents(n)
81 l = r1.linkrev(rev)
82 t = r1.revision(n)
83 n2 = r2.addrevision(t, tr, l, p1, p2)
84
85 if count % 1000 == 0:
86 write('.')
87 count += 1
88 finally:
94 finally:
89 write('\n')
95 write('\n')
90
96
91 def report(olddatafn, newdatafn):
97 def report(olddatafn, newdatafn):
92 oldsize = float(os.stat(olddatafn).st_size)
98 oldsize = float(os.stat(olddatafn).st_size)
93 newsize = float(os.stat(newdatafn).st_size)
99 newsize = float(os.stat(newdatafn).st_size)
94
100
95 # argh: have to pass an int to %d, because a float >= 2^32
101 # argh: have to pass an int to %d, because a float >= 2^32
96 # blows up under Python 2.5 or earlier
102 # blows up under Python 2.5 or earlier
97 sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n'
103 sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n'
98 % (int(oldsize), oldsize/1024/1024))
104 % (int(oldsize), oldsize/1024/1024))
99 sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n'
105 sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n'
100 % (int(newsize), newsize/1024/1024))
106 % (int(newsize), newsize/1024/1024))
101
107
102 shrink_percent = (oldsize - newsize) / oldsize * 100
108 shrink_percent = (oldsize - newsize) / oldsize * 100
103 shrink_factor = oldsize / newsize
109 shrink_factor = oldsize / newsize
104 sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n'
110 sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n'
105 % (shrink_percent, shrink_factor))
111 % (shrink_percent, shrink_factor))
106
112
107 def main():
113 def main():
108
114
109 # Unbuffer stdout for nice progress output.
115 # Unbuffer stdout for nice progress output.
110 sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
116 sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
111 write = sys.stdout.write
117 write = sys.stdout.write
112
118
113 parser = optparse.OptionParser(description=__doc__)
119 parser = optparse.OptionParser(description=__doc__)
114 parser.add_option('-R', '--repository',
120 parser.add_option('-R', '--repository',
115 default=os.path.curdir,
121 default=os.path.curdir,
116 metavar='REPO',
122 metavar='REPO',
117 help='repository root directory [default: current dir]')
123 help='repository root directory [default: current dir]')
118 parser.add_option('--revlog',
124 parser.add_option('--revlog',
119 metavar='FILE',
125 metavar='FILE',
120 help='shrink FILE [default: REPO/.hg/store/00manifest.i]')
126 help='shrink FILE [default: REPO/.hg/store/00manifest.i]')
121 (options, args) = parser.parse_args()
127 (options, args) = parser.parse_args()
122 if args:
128 if args:
123 parser.error('too many arguments')
129 parser.error('too many arguments')
124
130
125 # Open the specified repository.
131 # Open the specified repository.
126 ui = ui_.ui()
132 ui = ui_.ui()
127 repo = hg.repository(ui, options.repository)
133 repo = hg.repository(ui, options.repository)
128 if not repo.local():
134 if not repo.local():
129 parser.error('not a local repository: %s' % options.repository)
135 parser.error('not a local repository: %s' % options.repository)
130
136
131 if options.revlog is None:
137 if options.revlog is None:
132 indexfn = repo.sjoin('00manifest.i')
138 indexfn = repo.sjoin('00manifest.i')
133 else:
139 else:
134 if not options.revlog.endswith('.i'):
140 if not options.revlog.endswith('.i'):
135 parser.error('--revlog option must specify the revlog index file '
141 parser.error('--revlog option must specify the revlog index file '
136 '(*.i), not %s' % options.revlog)
142 '(*.i), not %s' % options.revlog)
137
143
138 indexfn = os.path.realpath(options.revlog)
144 indexfn = os.path.realpath(options.revlog)
139 store = repo.sjoin('')
145 store = repo.sjoin('')
140 if not indexfn.startswith(store):
146 if not indexfn.startswith(store):
141 parser.error('--revlog option must specify a revlog in %s, not %s'
147 parser.error('--revlog option must specify a revlog in %s, not %s'
142 % (store, indexfn))
148 % (store, indexfn))
143
149
144 datafn = indexfn[:-2] + '.d'
150 datafn = indexfn[:-2] + '.d'
145 if not os.path.exists(indexfn):
151 if not os.path.exists(indexfn):
146 parser.error('no such file: %s' % indexfn)
152 parser.error('no such file: %s' % indexfn)
147 if '00changelog' in indexfn:
153 if '00changelog' in indexfn:
148 parser.error('shrinking the changelog will corrupt your repository')
154 parser.error('shrinking the changelog will corrupt your repository')
149 if not os.path.exists(datafn):
155 if not os.path.exists(datafn):
150 # This is just a lazy shortcut because I can't be bothered to
156 # This is just a lazy shortcut because I can't be bothered to
151 # handle all the special cases that entail from no .d file.
157 # handle all the special cases that entail from no .d file.
152 parser.error('%s does not exist: revlog not big enough '
158 parser.error('%s does not exist: revlog not big enough '
153 'to be worth shrinking' % datafn)
159 'to be worth shrinking' % datafn)
154
160
155 oldindexfn = indexfn + '.old'
161 oldindexfn = indexfn + '.old'
156 olddatafn = datafn + '.old'
162 olddatafn = datafn + '.old'
157 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
163 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
158 parser.error('one or both of\n'
164 parser.error('one or both of\n'
159 ' %s\n'
165 ' %s\n'
160 ' %s\n'
166 ' %s\n'
161 'exists from a previous run; please clean up before '
167 'exists from a previous run; please clean up before '
162 'running again'
168 'running again'
163 % (oldindexfn, olddatafn))
169 % (oldindexfn, olddatafn))
164
170
165 write('shrinking %s\n' % indexfn)
171 write('shrinking %s\n' % indexfn)
166 prefix = os.path.basename(indexfn)[:-1]
172 prefix = os.path.basename(indexfn)[:-1]
167 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
173 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
168 prefix=prefix,
174 prefix=prefix,
169 suffix='.i')
175 suffix='.i')
170 tmpdatafn = tmpindexfn[:-2] + '.d'
176 tmpdatafn = tmpindexfn[:-2] + '.d'
171 os.close(tmpfd)
177 os.close(tmpfd)
172
178
173 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
179 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
174 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
180 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
175
181
176 # Don't use repo.transaction(), because then things get hairy with
182 # Don't use repo.transaction(), because then things get hairy with
177 # paths: some need to be relative to .hg, and some need to be
183 # paths: some need to be relative to .hg, and some need to be
178 # absolute. Doing it this way keeps things simple: everything is an
184 # absolute. Doing it this way keeps things simple: everything is an
179 # absolute path.
185 # absolute path.
180 lock = repo.lock(wait=False)
186 lock = repo.lock(wait=False)
181 tr = transaction.transaction(sys.stderr.write,
187 tr = transaction.transaction(sys.stderr.write,
182 open,
188 open,
183 repo.sjoin('journal'))
189 repo.sjoin('journal'))
184
190
185 try:
191 try:
186 try:
192 try:
187 order = toposort(r1)
193 order = toposort(r1)
188 writerevs(r1, r2, order, tr)
194 writerevs(r1, r2, order, tr)
189 report(datafn, tmpdatafn)
195 report(datafn, tmpdatafn)
190 tr.close()
196 tr.close()
191 except:
197 except:
192 # Abort transaction first, so we truncate the files before
198 # Abort transaction first, so we truncate the files before
193 # deleting them.
199 # deleting them.
194 tr.abort()
200 tr.abort()
195 if os.path.exists(tmpindexfn):
201 if os.path.exists(tmpindexfn):
196 os.unlink(tmpindexfn)
202 os.unlink(tmpindexfn)
197 if os.path.exists(tmpdatafn):
203 if os.path.exists(tmpdatafn):
198 os.unlink(tmpdatafn)
204 os.unlink(tmpdatafn)
199 raise
205 raise
200 finally:
206 finally:
201 lock.release()
207 lock.release()
202
208
203 os.link(indexfn, oldindexfn)
209 os.link(indexfn, oldindexfn)
204 os.link(datafn, olddatafn)
210 os.link(datafn, olddatafn)
205 os.rename(tmpindexfn, indexfn)
211 os.rename(tmpindexfn, indexfn)
206 os.rename(tmpdatafn, datafn)
212 os.rename(tmpdatafn, datafn)
207 write('note: old revlog saved in:\n'
213 write('note: old revlog saved in:\n'
208 ' %s\n'
214 ' %s\n'
209 ' %s\n'
215 ' %s\n'
210 '(You can delete those files when you are satisfied that your\n'
216 '(You can delete those files when you are satisfied that your\n'
211 'repository is still sane. '
217 'repository is still sane. '
212 'Running \'hg verify\' is strongly recommended.)\n'
218 'Running \'hg verify\' is strongly recommended.)\n'
213 % (oldindexfn, olddatafn))
219 % (oldindexfn, olddatafn))
214
220
215 try:
221 try:
216 main()
222 main()
217 except KeyboardInterrupt:
223 except KeyboardInterrupt:
218 sys.exit("interrupted")
224 sys.exit("interrupted")
General Comments 0
You need to be logged in to leave comments. Login now