##// END OF EJS Templates
shrink-revlog: factor out suboptimal computation
Benoit Boissinot -
r10626:3fc95c3b default
parent child Browse files
Show More
@@ -1,310 +1,302
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2
2
3 """\
3 """\
4 reorder a revlog (the manifest by default) to save space
4 reorder a revlog (the manifest by default) to save space
5
5
6 Specifically, this topologically sorts the revisions in the revlog so that
6 Specifically, this topologically sorts the revisions in the revlog so that
7 revisions on the same branch are adjacent as much as possible. This is a
7 revisions on the same branch are adjacent as much as possible. This is a
8 workaround for the fact that Mercurial computes deltas relative to the
8 workaround for the fact that Mercurial computes deltas relative to the
9 previous revision rather than relative to a parent revision.
9 previous revision rather than relative to a parent revision.
10
10
11 This is *not* safe to run on a changelog.
11 This is *not* safe to run on a changelog.
12 """
12 """
13
13
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
16 # renamed by Greg Ward <greg at gerg.ca>.
16 # renamed by Greg Ward <greg at gerg.ca>.
17
17
18 # XXX would be nice to have a way to verify the repository after shrinking,
18 # XXX would be nice to have a way to verify the repository after shrinking,
19 # e.g. by comparing "before" and "after" states of random changesets
19 # e.g. by comparing "before" and "after" states of random changesets
20 # (maybe: export before, shrink, export after, diff).
20 # (maybe: export before, shrink, export after, diff).
21
21
22 import os, tempfile, errno
22 import os, tempfile, errno
23 from mercurial import revlog, transaction, node, util
23 from mercurial import revlog, transaction, node, util
24 from mercurial import changegroup
24 from mercurial import changegroup
25 from mercurial.i18n import _
25 from mercurial.i18n import _
26
26
27
27
28
28
29 def toposort_reversepostorder(ui, rl):
29 def toposort_reversepostorder(ui, rl):
30 # postorder of the reverse directed graph
30 # postorder of the reverse directed graph
31
31
32 # map rev to list of parent revs (p2 first)
32 # map rev to list of parent revs (p2 first)
33 parents = {}
33 parents = {}
34 heads = set()
34 heads = set()
35 ui.status(_('reading revs\n'))
35 ui.status(_('reading revs\n'))
36 try:
36 try:
37 for rev in rl:
37 for rev in rl:
38 ui.progress(_('reading'), rev, total=len(rl))
38 ui.progress(_('reading'), rev, total=len(rl))
39 (p1, p2) = rl.parentrevs(rev)
39 (p1, p2) = rl.parentrevs(rev)
40 if p1 == p2 == node.nullrev:
40 if p1 == p2 == node.nullrev:
41 parents[rev] = () # root node
41 parents[rev] = () # root node
42 elif p1 == p2 or p2 == node.nullrev:
42 elif p1 == p2 or p2 == node.nullrev:
43 parents[rev] = (p1,) # normal node
43 parents[rev] = (p1,) # normal node
44 else:
44 else:
45 parents[rev] = (p2, p1) # merge node
45 parents[rev] = (p2, p1) # merge node
46
46
47 heads.add(rev)
47 heads.add(rev)
48
48
49 for p in parents[rev]:
49 for p in parents[rev]:
50 heads.discard(p)
50 heads.discard(p)
51 finally:
51 finally:
52 ui.progress(_('reading'), None, total=len(rl))
52 ui.progress(_('reading'), None, total=len(rl))
53
53
54 ui.status(_('sorting revs\n'))
54 ui.status(_('sorting revs\n'))
55 result = []
55 result = []
56 visit = list(heads)
56 visit = list(heads)
57 visit.sort(reverse=True)
57 visit.sort(reverse=True)
58 finished = set()
58 finished = set()
59 suboptimal = 0
60
59
61 while visit:
60 while visit:
62 cur = visit[-1]
61 cur = visit[-1]
63 for p in parents[cur]:
62 for p in parents[cur]:
64 if p not in finished:
63 if p not in finished:
65 visit.append(p)
64 visit.append(p)
66 break
65 break
67 else:
66 else:
68 curparents = rl.parentrevs(cur)
69 if result and result[-1] != curparents[0]:
70 suboptimal += 1
71
72 result.append(cur)
67 result.append(cur)
73 finished.add(cur)
68 finished.add(cur)
74 visit.pop()
69 visit.pop()
75
70
76 ui.note(_('%d suboptimal nodes\n') % suboptimal)
77
78 return result
71 return result
79
72
80 def toposort_postorderreverse(ui, rl):
73 def toposort_postorderreverse(ui, rl):
81 # reverse-postorder of the reverse directed graph
74 # reverse-postorder of the reverse directed graph
82
75
83 children = {}
76 children = {}
84 roots = set()
77 roots = set()
85 ui.status(_('reading revs\n'))
78 ui.status(_('reading revs\n'))
86 try:
79 try:
87 for rev in rl:
80 for rev in rl:
88 ui.progress(_('reading'), rev, total=len(rl))
81 ui.progress(_('reading'), rev, total=len(rl))
89 (p1, p2) = rl.parentrevs(rev)
82 (p1, p2) = rl.parentrevs(rev)
90 if p1 == p2 == node.nullrev:
83 if p1 == p2 == node.nullrev:
91 roots.add(rev)
84 roots.add(rev)
92
85
93 children[rev] = []
86 children[rev] = []
94
87
95 if p1 != node.nullrev:
88 if p1 != node.nullrev:
96 children[p1].append(rev)
89 children[p1].append(rev)
97 if p2 != node.nullrev:
90 if p2 != node.nullrev:
98 children[p2].append(rev)
91 children[p2].append(rev)
99 finally:
92 finally:
100 ui.progress(_('reading'), None, total=len(rl))
93 ui.progress(_('reading'), None, total=len(rl))
101
94
102 ui.status(_('sorting revs\n'))
95 ui.status(_('sorting revs\n'))
103 result = []
96 result = []
104 visit = list(roots)
97 visit = list(roots)
105 visit.sort()
98 visit.sort()
106 finished = set()
99 finished = set()
107 suboptimal = 0
108
100
109 while visit:
101 while visit:
110 cur = visit[-1]
102 cur = visit[-1]
111 for p in children[cur]:
103 for p in children[cur]:
112 if p not in finished:
104 if p not in finished:
113 visit.append(p)
105 visit.append(p)
114 break
106 break
115 else:
107 else:
116 # if cur is not the first parent of its successor, then the
117 # successor is a suboptimal node
118 if result:
119 succparents = rl.parentrevs(result[-1])
120 if succparents[0] != cur:
121 suboptimal += 1
122
123 result.append(cur)
108 result.append(cur)
124 finished.add(cur)
109 finished.add(cur)
125 visit.pop()
110 visit.pop()
126
111
127 ui.note(_('%d suboptimal nodes\n') % suboptimal)
128
129 result.reverse()
112 result.reverse()
130 return result
113 return result
131
114
132 def writerevs(ui, r1, r2, order, tr):
115 def writerevs(ui, r1, r2, order, tr):
133
116
134 ui.status(_('writing revs\n'))
117 ui.status(_('writing revs\n'))
135
118
136 count = [0]
119 count = [0]
137 def progress(*args):
120 def progress(*args):
138 ui.progress(_('writing'), count[0], total=len(order))
121 ui.progress(_('writing'), count[0], total=len(order))
139 count[0] += 1
122 count[0] += 1
140
123
141 order = [r1.node(r) for r in order]
124 order = [r1.node(r) for r in order]
142
125
143 # this is a bit ugly, but it works
126 # this is a bit ugly, but it works
144 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
127 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
145 unlookup = lambda x: int(x, 10)
128 unlookup = lambda x: int(x, 10)
146
129
147 try:
130 try:
148 group = util.chunkbuffer(r1.group(order, lookup, progress))
131 group = util.chunkbuffer(r1.group(order, lookup, progress))
149 chunkiter = changegroup.chunkiter(group)
132 chunkiter = changegroup.chunkiter(group)
150 r2.addgroup(chunkiter, unlookup, tr)
133 r2.addgroup(chunkiter, unlookup, tr)
151 finally:
134 finally:
152 ui.progress(_('writing'), None, len(order))
135 ui.progress(_('writing'), None, len(order))
153
136
154 def report(ui, r1, r2):
137 def report(ui, r1, r2):
155 def getsize(r):
138 def getsize(r):
156 s = 0
139 s = 0
157 for fn in (r.indexfile, r.datafile):
140 for fn in (r.indexfile, r.datafile):
158 try:
141 try:
159 s += os.stat(fn).st_size
142 s += os.stat(fn).st_size
160 except OSError, inst:
143 except OSError, inst:
161 if inst.errno != errno.ENOENT:
144 if inst.errno != errno.ENOENT:
162 raise
145 raise
163 return s
146 return s
164
147
165 oldsize = float(getsize(r1))
148 oldsize = float(getsize(r1))
166 newsize = float(getsize(r2))
149 newsize = float(getsize(r2))
167
150
168 # argh: have to pass an int to %d, because a float >= 2^32
151 # argh: have to pass an int to %d, because a float >= 2^32
169 # blows up under Python 2.5 or earlier
152 # blows up under Python 2.5 or earlier
170 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
153 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
171 % (int(oldsize), oldsize / 1024 / 1024))
154 % (int(oldsize), oldsize / 1024 / 1024))
172 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
155 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
173 % (int(newsize), newsize / 1024 / 1024))
156 % (int(newsize), newsize / 1024 / 1024))
174
157
175 shrink_percent = (oldsize - newsize) / oldsize * 100
158 shrink_percent = (oldsize - newsize) / oldsize * 100
176 shrink_factor = oldsize / newsize
159 shrink_factor = oldsize / newsize
177 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
160 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
178 % (shrink_percent, shrink_factor))
161 % (shrink_percent, shrink_factor))
179
162
180 def shrink(ui, repo, **opts):
163 def shrink(ui, repo, **opts):
181 """shrink a revlog by reordering revisions
164 """shrink a revlog by reordering revisions
182
165
183 Rewrites all the entries in some revlog of the current repository
166 Rewrites all the entries in some revlog of the current repository
184 (by default, the manifest log) to save space.
167 (by default, the manifest log) to save space.
185
168
186 Different sort algorithms have different performance
169 Different sort algorithms have different performance
187 characteristics. Use ``--sort`` to select a sort algorithm so you
170 characteristics. Use ``--sort`` to select a sort algorithm so you
188 can determine which works best for your data.
171 can determine which works best for your data.
189 """
172 """
190
173
191 if not repo.local():
174 if not repo.local():
192 raise util.Abort(_('not a local repository: %s') % repo.root)
175 raise util.Abort(_('not a local repository: %s') % repo.root)
193
176
194 fn = opts.get('revlog')
177 fn = opts.get('revlog')
195 if not fn:
178 if not fn:
196 indexfn = repo.sjoin('00manifest.i')
179 indexfn = repo.sjoin('00manifest.i')
197 else:
180 else:
198 if not fn.endswith('.i'):
181 if not fn.endswith('.i'):
199 raise util.Abort(_('--revlog option must specify the revlog index '
182 raise util.Abort(_('--revlog option must specify the revlog index '
200 'file (*.i), not %s') % opts.get('revlog'))
183 'file (*.i), not %s') % opts.get('revlog'))
201
184
202 indexfn = os.path.realpath(fn)
185 indexfn = os.path.realpath(fn)
203 store = repo.sjoin('')
186 store = repo.sjoin('')
204 if not indexfn.startswith(store):
187 if not indexfn.startswith(store):
205 raise util.Abort(_('--revlog option must specify a revlog in %s, '
188 raise util.Abort(_('--revlog option must specify a revlog in %s, '
206 'not %s') % (store, indexfn))
189 'not %s') % (store, indexfn))
207
190
208 sortname = opts['sort']
191 sortname = opts['sort']
209 try:
192 try:
210 toposort = globals()['toposort_' + sortname]
193 toposort = globals()['toposort_' + sortname]
211 except KeyError:
194 except KeyError:
212 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
195 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
213
196
214 if not os.path.exists(indexfn):
197 if not os.path.exists(indexfn):
215 raise util.Abort(_('no such file: %s') % indexfn)
198 raise util.Abort(_('no such file: %s') % indexfn)
216 if '00changelog' in indexfn:
199 if '00changelog' in indexfn:
217 raise util.Abort(_('shrinking the changelog '
200 raise util.Abort(_('shrinking the changelog '
218 'will corrupt your repository'))
201 'will corrupt your repository'))
219
202
220 ui.write(_('shrinking %s\n') % indexfn)
203 ui.write(_('shrinking %s\n') % indexfn)
221 prefix = os.path.basename(indexfn)[:-1]
204 prefix = os.path.basename(indexfn)[:-1]
222 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
205 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
223 prefix=prefix,
206 prefix=prefix,
224 suffix='.i')
207 suffix='.i')
225 os.close(tmpfd)
208 os.close(tmpfd)
226
209
227 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
210 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
228 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
211 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
229
212
230 datafn, tmpdatafn = r1.datafile, r2.datafile
213 datafn, tmpdatafn = r1.datafile, r2.datafile
231
214
232 oldindexfn = indexfn + '.old'
215 oldindexfn = indexfn + '.old'
233 olddatafn = datafn + '.old'
216 olddatafn = datafn + '.old'
234 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
217 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
235 raise util.Abort(_('one or both of\n'
218 raise util.Abort(_('one or both of\n'
236 ' %s\n'
219 ' %s\n'
237 ' %s\n'
220 ' %s\n'
238 'exists from a previous run; please clean up '
221 'exists from a previous run; please clean up '
239 'before running again') % (oldindexfn, olddatafn))
222 'before running again') % (oldindexfn, olddatafn))
240
223
241 # Don't use repo.transaction(), because then things get hairy with
224 # Don't use repo.transaction(), because then things get hairy with
242 # paths: some need to be relative to .hg, and some need to be
225 # paths: some need to be relative to .hg, and some need to be
243 # absolute. Doing it this way keeps things simple: everything is an
226 # absolute. Doing it this way keeps things simple: everything is an
244 # absolute path.
227 # absolute path.
245 lock = repo.lock(wait=False)
228 lock = repo.lock(wait=False)
246 tr = transaction.transaction(ui.warn,
229 tr = transaction.transaction(ui.warn,
247 open,
230 open,
248 repo.sjoin('journal'))
231 repo.sjoin('journal'))
249
232
250 def ignoremissing(func):
233 def ignoremissing(func):
251 def f(*args, **kw):
234 def f(*args, **kw):
252 try:
235 try:
253 return func(*args, **kw)
236 return func(*args, **kw)
254 except OSError, inst:
237 except OSError, inst:
255 if inst.errno != errno.ENOENT:
238 if inst.errno != errno.ENOENT:
256 raise
239 raise
257 return f
240 return f
258
241
259 try:
242 try:
260 try:
243 try:
261 order = toposort(ui, r1)
244 order = toposort(ui, r1)
245
246 suboptimal = 0
247 for i in xrange(1, len(order)):
248 parents = [p for p in r1.parentrevs(order[i])
249 if p != node.nullrev]
250 if parents and order[i-1] not in parents:
251 suboptimal += 1
252 ui.note(_('%d suboptimal nodes\n') % suboptimal)
253
262 writerevs(ui, r1, r2, order, tr)
254 writerevs(ui, r1, r2, order, tr)
263 report(ui, r1, r2)
255 report(ui, r1, r2)
264 tr.close()
256 tr.close()
265 except:
257 except:
266 # Abort transaction first, so we truncate the files before
258 # Abort transaction first, so we truncate the files before
267 # deleting them.
259 # deleting them.
268 tr.abort()
260 tr.abort()
269 for fn in (tmpindexfn, tmpdatafn):
261 for fn in (tmpindexfn, tmpdatafn):
270 ignoremissing(os.unlink)(fn)
262 ignoremissing(os.unlink)(fn)
271 raise
263 raise
272 if not opts.get('dry_run'):
264 if not opts.get('dry_run'):
273 # racy, both files cannot be renamed atomically
265 # racy, both files cannot be renamed atomically
274 # copy files
266 # copy files
275 util.os_link(indexfn, oldindexfn)
267 util.os_link(indexfn, oldindexfn)
276 ignoremissing(util.os_link)(datafn, olddatafn)
268 ignoremissing(util.os_link)(datafn, olddatafn)
277 # rename
269 # rename
278 util.rename(tmpindexfn, indexfn)
270 util.rename(tmpindexfn, indexfn)
279 try:
271 try:
280 util.rename(tmpdatafn, datafn)
272 util.rename(tmpdatafn, datafn)
281 except OSError, inst:
273 except OSError, inst:
282 if inst.errno != errno.ENOENT:
274 if inst.errno != errno.ENOENT:
283 raise
275 raise
284 ignoremissing(os.unlink)(datafn)
276 ignoremissing(os.unlink)(datafn)
285 else:
277 else:
286 for fn in (tmpindexfn, tmpdatafn):
278 for fn in (tmpindexfn, tmpdatafn):
287 ignoremissing(os.unlink)(fn)
279 ignoremissing(os.unlink)(fn)
288 finally:
280 finally:
289 lock.release()
281 lock.release()
290
282
291 if not opts.get('dry_run'):
283 if not opts.get('dry_run'):
292 ui.write(_('note: old revlog saved in:\n'
284 ui.write(_('note: old revlog saved in:\n'
293 ' %s\n'
285 ' %s\n'
294 ' %s\n'
286 ' %s\n'
295 '(You can delete those files when you are satisfied that your\n'
287 '(You can delete those files when you are satisfied that your\n'
296 'repository is still sane. '
288 'repository is still sane. '
297 'Running \'hg verify\' is strongly recommended.)\n')
289 'Running \'hg verify\' is strongly recommended.)\n')
298 % (oldindexfn, olddatafn))
290 % (oldindexfn, olddatafn))
299
291
300 cmdtable = {
292 cmdtable = {
301 'shrink': (shrink,
293 'shrink': (shrink,
302 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
294 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
303 ('n', 'dry-run', None, _('do not shrink, simulate only')),
295 ('n', 'dry-run', None, _('do not shrink, simulate only')),
304 ('', 'sort', 'reversepostorder', 'name of sort algorithm to use'),
296 ('', 'sort', 'reversepostorder', 'name of sort algorithm to use'),
305 ],
297 ],
306 _('hg shrink [--revlog PATH]'))
298 _('hg shrink [--revlog PATH]'))
307 }
299 }
308
300
309 if __name__ == "__main__":
301 if __name__ == "__main__":
310 print "shrink-revlog.py is now an extension (see hg help extensions)"
302 print "shrink-revlog.py is now an extension (see hg help extensions)"
General Comments 0
You need to be logged in to leave comments. Login now