##// END OF EJS Templates
shrink-revlog: remove branchsort algorithm (it behaves poorly)
Benoit Boissinot -
r10625:add562ab default
parent child Browse files
Show More
@@ -1,368 +1,310 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2
2
3 """\
3 """\
4 reorder a revlog (the manifest by default) to save space
4 reorder a revlog (the manifest by default) to save space
5
5
6 Specifically, this topologically sorts the revisions in the revlog so that
6 Specifically, this topologically sorts the revisions in the revlog so that
7 revisions on the same branch are adjacent as much as possible. This is a
7 revisions on the same branch are adjacent as much as possible. This is a
8 workaround for the fact that Mercurial computes deltas relative to the
8 workaround for the fact that Mercurial computes deltas relative to the
9 previous revision rather than relative to a parent revision.
9 previous revision rather than relative to a parent revision.
10
10
11 This is *not* safe to run on a changelog.
11 This is *not* safe to run on a changelog.
12 """
12 """
13
13
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
16 # renamed by Greg Ward <greg at gerg.ca>.
16 # renamed by Greg Ward <greg at gerg.ca>.
17
17
18 # XXX would be nice to have a way to verify the repository after shrinking,
18 # XXX would be nice to have a way to verify the repository after shrinking,
19 # e.g. by comparing "before" and "after" states of random changesets
19 # e.g. by comparing "before" and "after" states of random changesets
20 # (maybe: export before, shrink, export after, diff).
20 # (maybe: export before, shrink, export after, diff).
21
21
22 import os, tempfile, errno
22 import os, tempfile, errno
23 from mercurial import revlog, transaction, node, util
23 from mercurial import revlog, transaction, node, util
24 from mercurial import changegroup
24 from mercurial import changegroup
25 from mercurial.i18n import _
25 from mercurial.i18n import _
26
26
27 def toposort_branchsort(ui, rl):
28
27
29 children = {}
30 root = []
31 # build children and roots
32 ui.status(_('reading revs\n'))
33 try:
34 for rev in rl:
35 ui.progress(_('reading'), rev, total=len(rl))
36 children[rev] = []
37 parents = [p for p in rl.parentrevs(rev) if p != node.nullrev]
38 # in case of duplicate parents
39 if len(parents) == 2 and parents[0] == parents[1]:
40 del parents[1]
41 for p in parents:
42 assert p in children
43 children[p].append(rev)
44
45 if len(parents) == 0:
46 root.append(rev)
47 finally:
48 ui.progress(_('reading'), None, total=len(rl))
49
50 # XXX this is a reimplementation of the 'branchsort' topo sort
51 # algorithm in hgext.convert.convcmd... would be nice not to duplicate
52 # the algorithm
53 ui.status(_('sorting revs\n'))
54 visit = root
55 result = []
56
57 # suboptimal: nodes whose predecessor is not first parent
58 suboptimal = 0
59
60 while visit:
61 cur = visit.pop(0)
62 # revlog will compute delta relative to result[-1], so keep track
63 # of nodes where this might result in a large delta
64 parents = rl.parentrevs(cur)
65 if result:
66 if result[-1] != parents[0]:
67 suboptimal += 1
68
69 result.append(cur)
70 if cur not in children:
71 # This only happens if some node's p1 == p2, which can
72 # happen in the manifest in certain circumstances.
73 continue
74 next = []
75 for c in children.pop(cur):
76 parents_unseen = [p for p in rl.parentrevs(c)
77 if p != node.nullrev and p in children]
78 if len(parents_unseen) == 0:
79 next.append(c)
80 visit = next + visit
81 ui.note(_('%d suboptimal nodes\n') % suboptimal)
82 return result
83
28
84 def toposort_reversepostorder(ui, rl):
29 def toposort_reversepostorder(ui, rl):
85 # postorder of the reverse directed graph
30 # postorder of the reverse directed graph
86
31
87 # map rev to list of parent revs (p2 first)
32 # map rev to list of parent revs (p2 first)
88 parents = {}
33 parents = {}
89 heads = set()
34 heads = set()
90 ui.status(_('reading revs\n'))
35 ui.status(_('reading revs\n'))
91 try:
36 try:
92 for rev in rl:
37 for rev in rl:
93 ui.progress(_('reading'), rev, total=len(rl))
38 ui.progress(_('reading'), rev, total=len(rl))
94 (p1, p2) = rl.parentrevs(rev)
39 (p1, p2) = rl.parentrevs(rev)
95 if p1 == p2 == node.nullrev:
40 if p1 == p2 == node.nullrev:
96 parents[rev] = () # root node
41 parents[rev] = () # root node
97 elif p1 == p2 or p2 == node.nullrev:
42 elif p1 == p2 or p2 == node.nullrev:
98 parents[rev] = (p1,) # normal node
43 parents[rev] = (p1,) # normal node
99 else:
44 else:
100 parents[rev] = (p2, p1) # merge node
45 parents[rev] = (p2, p1) # merge node
101
46
102 heads.add(rev)
47 heads.add(rev)
103
48
104 for p in parents[rev]:
49 for p in parents[rev]:
105 heads.discard(p)
50 heads.discard(p)
106 finally:
51 finally:
107 ui.progress(_('reading'), None, total=len(rl))
52 ui.progress(_('reading'), None, total=len(rl))
108
53
109 ui.status(_('sorting revs\n'))
54 ui.status(_('sorting revs\n'))
110 result = []
55 result = []
111 visit = list(heads)
56 visit = list(heads)
112 visit.sort(reverse=True)
57 visit.sort(reverse=True)
113 finished = set()
58 finished = set()
114 suboptimal = 0
59 suboptimal = 0
115
60
116 while visit:
61 while visit:
117 cur = visit[-1]
62 cur = visit[-1]
118 for p in parents[cur]:
63 for p in parents[cur]:
119 if p not in finished:
64 if p not in finished:
120 visit.append(p)
65 visit.append(p)
121 break
66 break
122 else:
67 else:
123 curparents = rl.parentrevs(cur)
68 curparents = rl.parentrevs(cur)
124 if result and result[-1] != curparents[0]:
69 if result and result[-1] != curparents[0]:
125 suboptimal += 1
70 suboptimal += 1
126
71
127 result.append(cur)
72 result.append(cur)
128 finished.add(cur)
73 finished.add(cur)
129 visit.pop()
74 visit.pop()
130
75
131 ui.note(_('%d suboptimal nodes\n') % suboptimal)
76 ui.note(_('%d suboptimal nodes\n') % suboptimal)
132
77
133 return result
78 return result
134
79
135 def toposort_postorderreverse(ui, rl):
80 def toposort_postorderreverse(ui, rl):
136 # reverse-postorder of the reverse directed graph
81 # reverse-postorder of the reverse directed graph
137
82
138 children = {}
83 children = {}
139 roots = set()
84 roots = set()
140 ui.status(_('reading revs\n'))
85 ui.status(_('reading revs\n'))
141 try:
86 try:
142 for rev in rl:
87 for rev in rl:
143 ui.progress(_('reading'), rev, total=len(rl))
88 ui.progress(_('reading'), rev, total=len(rl))
144 (p1, p2) = rl.parentrevs(rev)
89 (p1, p2) = rl.parentrevs(rev)
145 if p1 == p2 == node.nullrev:
90 if p1 == p2 == node.nullrev:
146 roots.add(rev)
91 roots.add(rev)
147
92
148 children[rev] = []
93 children[rev] = []
149
94
150 if p1 != node.nullrev:
95 if p1 != node.nullrev:
151 children[p1].append(rev)
96 children[p1].append(rev)
152 if p2 != node.nullrev:
97 if p2 != node.nullrev:
153 children[p2].append(rev)
98 children[p2].append(rev)
154 finally:
99 finally:
155 ui.progress(_('reading'), None, total=len(rl))
100 ui.progress(_('reading'), None, total=len(rl))
156
101
157 ui.status(_('sorting revs\n'))
102 ui.status(_('sorting revs\n'))
158 result = []
103 result = []
159 visit = list(roots)
104 visit = list(roots)
160 visit.sort()
105 visit.sort()
161 finished = set()
106 finished = set()
162 suboptimal = 0
107 suboptimal = 0
163
108
164 while visit:
109 while visit:
165 cur = visit[-1]
110 cur = visit[-1]
166 for p in children[cur]:
111 for p in children[cur]:
167 if p not in finished:
112 if p not in finished:
168 visit.append(p)
113 visit.append(p)
169 break
114 break
170 else:
115 else:
171 # if cur is not the first parent of its successor, then the
116 # if cur is not the first parent of its successor, then the
172 # successor is a suboptimal node
117 # successor is a suboptimal node
173 if result:
118 if result:
174 succparents = rl.parentrevs(result[-1])
119 succparents = rl.parentrevs(result[-1])
175 if succparents[0] != cur:
120 if succparents[0] != cur:
176 suboptimal += 1
121 suboptimal += 1
177
122
178 result.append(cur)
123 result.append(cur)
179 finished.add(cur)
124 finished.add(cur)
180 visit.pop()
125 visit.pop()
181
126
182 ui.note(_('%d suboptimal nodes\n') % suboptimal)
127 ui.note(_('%d suboptimal nodes\n') % suboptimal)
183
128
184 result.reverse()
129 result.reverse()
185 return result
130 return result
186
131
187 def writerevs(ui, r1, r2, order, tr):
132 def writerevs(ui, r1, r2, order, tr):
188
133
189 ui.status(_('writing revs\n'))
134 ui.status(_('writing revs\n'))
190
135
191 count = [0]
136 count = [0]
192 def progress(*args):
137 def progress(*args):
193 ui.progress(_('writing'), count[0], total=len(order))
138 ui.progress(_('writing'), count[0], total=len(order))
194 count[0] += 1
139 count[0] += 1
195
140
196 order = [r1.node(r) for r in order]
141 order = [r1.node(r) for r in order]
197
142
198 # this is a bit ugly, but it works
143 # this is a bit ugly, but it works
199 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
144 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
200 unlookup = lambda x: int(x, 10)
145 unlookup = lambda x: int(x, 10)
201
146
202 try:
147 try:
203 group = util.chunkbuffer(r1.group(order, lookup, progress))
148 group = util.chunkbuffer(r1.group(order, lookup, progress))
204 chunkiter = changegroup.chunkiter(group)
149 chunkiter = changegroup.chunkiter(group)
205 r2.addgroup(chunkiter, unlookup, tr)
150 r2.addgroup(chunkiter, unlookup, tr)
206 finally:
151 finally:
207 ui.progress(_('writing'), None, len(order))
152 ui.progress(_('writing'), None, len(order))
208
153
209 def report(ui, r1, r2):
154 def report(ui, r1, r2):
210 def getsize(r):
155 def getsize(r):
211 s = 0
156 s = 0
212 for fn in (r.indexfile, r.datafile):
157 for fn in (r.indexfile, r.datafile):
213 try:
158 try:
214 s += os.stat(fn).st_size
159 s += os.stat(fn).st_size
215 except OSError, inst:
160 except OSError, inst:
216 if inst.errno != errno.ENOENT:
161 if inst.errno != errno.ENOENT:
217 raise
162 raise
218 return s
163 return s
219
164
220 oldsize = float(getsize(r1))
165 oldsize = float(getsize(r1))
221 newsize = float(getsize(r2))
166 newsize = float(getsize(r2))
222
167
223 # argh: have to pass an int to %d, because a float >= 2^32
168 # argh: have to pass an int to %d, because a float >= 2^32
224 # blows up under Python 2.5 or earlier
169 # blows up under Python 2.5 or earlier
225 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
170 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
226 % (int(oldsize), oldsize / 1024 / 1024))
171 % (int(oldsize), oldsize / 1024 / 1024))
227 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
172 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
228 % (int(newsize), newsize / 1024 / 1024))
173 % (int(newsize), newsize / 1024 / 1024))
229
174
230 shrink_percent = (oldsize - newsize) / oldsize * 100
175 shrink_percent = (oldsize - newsize) / oldsize * 100
231 shrink_factor = oldsize / newsize
176 shrink_factor = oldsize / newsize
232 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
177 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
233 % (shrink_percent, shrink_factor))
178 % (shrink_percent, shrink_factor))
234
179
235 def shrink(ui, repo, **opts):
180 def shrink(ui, repo, **opts):
236 """shrink a revlog by reordering revisions
181 """shrink a revlog by reordering revisions
237
182
238 Rewrites all the entries in some revlog of the current repository
183 Rewrites all the entries in some revlog of the current repository
239 (by default, the manifest log) to save space.
184 (by default, the manifest log) to save space.
240
185
241 Different sort algorithms have different performance
186 Different sort algorithms have different performance
242 characteristics. Use ``--sort`` to select a sort algorithm so you
187 characteristics. Use ``--sort`` to select a sort algorithm so you
243 can determine which works best for your data. The default
188 can determine which works best for your data.
244 algorithm, ``branchsort``, works well for workflows with lots of
245 active (unmerged) branches, but not so well when all branches have
246 been merged and there is only one repository head.
247 """
189 """
248
190
249 if not repo.local():
191 if not repo.local():
250 raise util.Abort(_('not a local repository: %s') % repo.root)
192 raise util.Abort(_('not a local repository: %s') % repo.root)
251
193
252 fn = opts.get('revlog')
194 fn = opts.get('revlog')
253 if not fn:
195 if not fn:
254 indexfn = repo.sjoin('00manifest.i')
196 indexfn = repo.sjoin('00manifest.i')
255 else:
197 else:
256 if not fn.endswith('.i'):
198 if not fn.endswith('.i'):
257 raise util.Abort(_('--revlog option must specify the revlog index '
199 raise util.Abort(_('--revlog option must specify the revlog index '
258 'file (*.i), not %s') % opts.get('revlog'))
200 'file (*.i), not %s') % opts.get('revlog'))
259
201
260 indexfn = os.path.realpath(fn)
202 indexfn = os.path.realpath(fn)
261 store = repo.sjoin('')
203 store = repo.sjoin('')
262 if not indexfn.startswith(store):
204 if not indexfn.startswith(store):
263 raise util.Abort(_('--revlog option must specify a revlog in %s, '
205 raise util.Abort(_('--revlog option must specify a revlog in %s, '
264 'not %s') % (store, indexfn))
206 'not %s') % (store, indexfn))
265
207
266 sortname = opts['sort']
208 sortname = opts['sort']
267 try:
209 try:
268 toposort = globals()['toposort_' + sortname]
210 toposort = globals()['toposort_' + sortname]
269 except KeyError:
211 except KeyError:
270 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
212 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
271
213
272 if not os.path.exists(indexfn):
214 if not os.path.exists(indexfn):
273 raise util.Abort(_('no such file: %s') % indexfn)
215 raise util.Abort(_('no such file: %s') % indexfn)
274 if '00changelog' in indexfn:
216 if '00changelog' in indexfn:
275 raise util.Abort(_('shrinking the changelog '
217 raise util.Abort(_('shrinking the changelog '
276 'will corrupt your repository'))
218 'will corrupt your repository'))
277
219
278 ui.write(_('shrinking %s\n') % indexfn)
220 ui.write(_('shrinking %s\n') % indexfn)
279 prefix = os.path.basename(indexfn)[:-1]
221 prefix = os.path.basename(indexfn)[:-1]
280 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
222 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
281 prefix=prefix,
223 prefix=prefix,
282 suffix='.i')
224 suffix='.i')
283 os.close(tmpfd)
225 os.close(tmpfd)
284
226
285 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
227 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
286 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
228 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
287
229
288 datafn, tmpdatafn = r1.datafile, r2.datafile
230 datafn, tmpdatafn = r1.datafile, r2.datafile
289
231
290 oldindexfn = indexfn + '.old'
232 oldindexfn = indexfn + '.old'
291 olddatafn = datafn + '.old'
233 olddatafn = datafn + '.old'
292 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
234 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
293 raise util.Abort(_('one or both of\n'
235 raise util.Abort(_('one or both of\n'
294 ' %s\n'
236 ' %s\n'
295 ' %s\n'
237 ' %s\n'
296 'exists from a previous run; please clean up '
238 'exists from a previous run; please clean up '
297 'before running again') % (oldindexfn, olddatafn))
239 'before running again') % (oldindexfn, olddatafn))
298
240
299 # Don't use repo.transaction(), because then things get hairy with
241 # Don't use repo.transaction(), because then things get hairy with
300 # paths: some need to be relative to .hg, and some need to be
242 # paths: some need to be relative to .hg, and some need to be
301 # absolute. Doing it this way keeps things simple: everything is an
243 # absolute. Doing it this way keeps things simple: everything is an
302 # absolute path.
244 # absolute path.
303 lock = repo.lock(wait=False)
245 lock = repo.lock(wait=False)
304 tr = transaction.transaction(ui.warn,
246 tr = transaction.transaction(ui.warn,
305 open,
247 open,
306 repo.sjoin('journal'))
248 repo.sjoin('journal'))
307
249
308 def ignoremissing(func):
250 def ignoremissing(func):
309 def f(*args, **kw):
251 def f(*args, **kw):
310 try:
252 try:
311 return func(*args, **kw)
253 return func(*args, **kw)
312 except OSError, inst:
254 except OSError, inst:
313 if inst.errno != errno.ENOENT:
255 if inst.errno != errno.ENOENT:
314 raise
256 raise
315 return f
257 return f
316
258
317 try:
259 try:
318 try:
260 try:
319 order = toposort(ui, r1)
261 order = toposort(ui, r1)
320 writerevs(ui, r1, r2, order, tr)
262 writerevs(ui, r1, r2, order, tr)
321 report(ui, r1, r2)
263 report(ui, r1, r2)
322 tr.close()
264 tr.close()
323 except:
265 except:
324 # Abort transaction first, so we truncate the files before
266 # Abort transaction first, so we truncate the files before
325 # deleting them.
267 # deleting them.
326 tr.abort()
268 tr.abort()
327 for fn in (tmpindexfn, tmpdatafn):
269 for fn in (tmpindexfn, tmpdatafn):
328 ignoremissing(os.unlink)(fn)
270 ignoremissing(os.unlink)(fn)
329 raise
271 raise
330 if not opts.get('dry_run'):
272 if not opts.get('dry_run'):
331 # racy, both files cannot be renamed atomically
273 # racy, both files cannot be renamed atomically
332 # copy files
274 # copy files
333 util.os_link(indexfn, oldindexfn)
275 util.os_link(indexfn, oldindexfn)
334 ignoremissing(util.os_link)(datafn, olddatafn)
276 ignoremissing(util.os_link)(datafn, olddatafn)
335 # rename
277 # rename
336 util.rename(tmpindexfn, indexfn)
278 util.rename(tmpindexfn, indexfn)
337 try:
279 try:
338 util.rename(tmpdatafn, datafn)
280 util.rename(tmpdatafn, datafn)
339 except OSError, inst:
281 except OSError, inst:
340 if inst.errno != errno.ENOENT:
282 if inst.errno != errno.ENOENT:
341 raise
283 raise
342 ignoremissing(os.unlink)(datafn)
284 ignoremissing(os.unlink)(datafn)
343 else:
285 else:
344 for fn in (tmpindexfn, tmpdatafn):
286 for fn in (tmpindexfn, tmpdatafn):
345 ignoremissing(os.unlink)(fn)
287 ignoremissing(os.unlink)(fn)
346 finally:
288 finally:
347 lock.release()
289 lock.release()
348
290
349 if not opts.get('dry_run'):
291 if not opts.get('dry_run'):
350 ui.write(_('note: old revlog saved in:\n'
292 ui.write(_('note: old revlog saved in:\n'
351 ' %s\n'
293 ' %s\n'
352 ' %s\n'
294 ' %s\n'
353 '(You can delete those files when you are satisfied that your\n'
295 '(You can delete those files when you are satisfied that your\n'
354 'repository is still sane. '
296 'repository is still sane. '
355 'Running \'hg verify\' is strongly recommended.)\n')
297 'Running \'hg verify\' is strongly recommended.)\n')
356 % (oldindexfn, olddatafn))
298 % (oldindexfn, olddatafn))
357
299
358 cmdtable = {
300 cmdtable = {
359 'shrink': (shrink,
301 'shrink': (shrink,
360 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
302 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
361 ('n', 'dry-run', None, _('do not shrink, simulate only')),
303 ('n', 'dry-run', None, _('do not shrink, simulate only')),
362 ('', 'sort', 'branchsort', 'name of sort algorithm to use'),
304 ('', 'sort', 'reversepostorder', 'name of sort algorithm to use'),
363 ],
305 ],
364 _('hg shrink [--revlog PATH]'))
306 _('hg shrink [--revlog PATH]'))
365 }
307 }
366
308
367 if __name__ == "__main__":
309 if __name__ == "__main__":
368 print "shrink-revlog.py is now an extension (see hg help extensions)"
310 print "shrink-revlog.py is now an extension (see hg help extensions)"
General Comments 0
You need to be logged in to leave comments. Login now