##// END OF EJS Templates
shrink-revlog: add accounting of suboptimal nodes to the new algorithms.
Greg Ward -
r10624:432eb853 default
parent child Browse files
Show More
@@ -1,348 +1,368 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2
2
3 """\
3 """\
4 reorder a revlog (the manifest by default) to save space
4 reorder a revlog (the manifest by default) to save space
5
5
6 Specifically, this topologically sorts the revisions in the revlog so that
6 Specifically, this topologically sorts the revisions in the revlog so that
7 revisions on the same branch are adjacent as much as possible. This is a
7 revisions on the same branch are adjacent as much as possible. This is a
8 workaround for the fact that Mercurial computes deltas relative to the
8 workaround for the fact that Mercurial computes deltas relative to the
9 previous revision rather than relative to a parent revision.
9 previous revision rather than relative to a parent revision.
10
10
11 This is *not* safe to run on a changelog.
11 This is *not* safe to run on a changelog.
12 """
12 """
13
13
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
16 # renamed by Greg Ward <greg at gerg.ca>.
16 # renamed by Greg Ward <greg at gerg.ca>.
17
17
18 # XXX would be nice to have a way to verify the repository after shrinking,
18 # XXX would be nice to have a way to verify the repository after shrinking,
19 # e.g. by comparing "before" and "after" states of random changesets
19 # e.g. by comparing "before" and "after" states of random changesets
20 # (maybe: export before, shrink, export after, diff).
20 # (maybe: export before, shrink, export after, diff).
21
21
22 import os, tempfile, errno
22 import os, tempfile, errno
23 from mercurial import revlog, transaction, node, util
23 from mercurial import revlog, transaction, node, util
24 from mercurial import changegroup
24 from mercurial import changegroup
25 from mercurial.i18n import _
25 from mercurial.i18n import _
26
26
27 def toposort_branchsort(ui, rl):
27 def toposort_branchsort(ui, rl):
28
28
29 children = {}
29 children = {}
30 root = []
30 root = []
31 # build children and roots
31 # build children and roots
32 ui.status(_('reading revs\n'))
32 ui.status(_('reading revs\n'))
33 try:
33 try:
34 for rev in rl:
34 for rev in rl:
35 ui.progress(_('reading'), rev, total=len(rl))
35 ui.progress(_('reading'), rev, total=len(rl))
36 children[rev] = []
36 children[rev] = []
37 parents = [p for p in rl.parentrevs(rev) if p != node.nullrev]
37 parents = [p for p in rl.parentrevs(rev) if p != node.nullrev]
38 # in case of duplicate parents
38 # in case of duplicate parents
39 if len(parents) == 2 and parents[0] == parents[1]:
39 if len(parents) == 2 and parents[0] == parents[1]:
40 del parents[1]
40 del parents[1]
41 for p in parents:
41 for p in parents:
42 assert p in children
42 assert p in children
43 children[p].append(rev)
43 children[p].append(rev)
44
44
45 if len(parents) == 0:
45 if len(parents) == 0:
46 root.append(rev)
46 root.append(rev)
47 finally:
47 finally:
48 ui.progress(_('reading'), None, total=len(rl))
48 ui.progress(_('reading'), None, total=len(rl))
49
49
50 # XXX this is a reimplementation of the 'branchsort' topo sort
50 # XXX this is a reimplementation of the 'branchsort' topo sort
51 # algorithm in hgext.convert.convcmd... would be nice not to duplicate
51 # algorithm in hgext.convert.convcmd... would be nice not to duplicate
52 # the algorithm
52 # the algorithm
53 ui.status(_('sorting revs\n'))
53 ui.status(_('sorting revs\n'))
54 visit = root
54 visit = root
55 result = []
55 result = []
56
56
57 # suboptimal: nodes whose predecessor is not first parent
57 # suboptimal: nodes whose predecessor is not first parent
58 suboptimal = 0
58 suboptimal = 0
59
59
60 while visit:
60 while visit:
61 cur = visit.pop(0)
61 cur = visit.pop(0)
62 # revlog will compute delta relative to result[-1], so keep track
62 # revlog will compute delta relative to result[-1], so keep track
63 # of nodes where this might result in a large delta
63 # of nodes where this might result in a large delta
64 parents = rl.parentrevs(cur)
64 parents = rl.parentrevs(cur)
65 if result:
65 if result:
66 if result[-1] != parents[0]:
66 if result[-1] != parents[0]:
67 suboptimal += 1
67 suboptimal += 1
68
68
69 result.append(cur)
69 result.append(cur)
70 if cur not in children:
70 if cur not in children:
71 # This only happens if some node's p1 == p2, which can
71 # This only happens if some node's p1 == p2, which can
72 # happen in the manifest in certain circumstances.
72 # happen in the manifest in certain circumstances.
73 continue
73 continue
74 next = []
74 next = []
75 for c in children.pop(cur):
75 for c in children.pop(cur):
76 parents_unseen = [p for p in rl.parentrevs(c)
76 parents_unseen = [p for p in rl.parentrevs(c)
77 if p != node.nullrev and p in children]
77 if p != node.nullrev and p in children]
78 if len(parents_unseen) == 0:
78 if len(parents_unseen) == 0:
79 next.append(c)
79 next.append(c)
80 visit = next + visit
80 visit = next + visit
81 ui.note(_('%d suboptimal nodes\n') % suboptimal)
81 ui.note(_('%d suboptimal nodes\n') % suboptimal)
82 return result
82 return result
83
83
84 def toposort_reversepostorder(ui, rl):
84 def toposort_reversepostorder(ui, rl):
85 # postorder of the reverse directed graph
85 # postorder of the reverse directed graph
86
86
87 # map rev to list of parent revs (p2 first)
87 # map rev to list of parent revs (p2 first)
88 parents = {}
88 parents = {}
89 heads = set()
89 heads = set()
90 ui.status(_('reading revs\n'))
90 ui.status(_('reading revs\n'))
91 try:
91 try:
92 for rev in rl:
92 for rev in rl:
93 ui.progress(_('reading'), rev, total=len(rl))
93 ui.progress(_('reading'), rev, total=len(rl))
94 (p1, p2) = rl.parentrevs(rev)
94 (p1, p2) = rl.parentrevs(rev)
95 if p1 == p2 == node.nullrev:
95 if p1 == p2 == node.nullrev:
96 parents[rev] = () # root node
96 parents[rev] = () # root node
97 elif p1 == p2 or p2 == node.nullrev:
97 elif p1 == p2 or p2 == node.nullrev:
98 parents[rev] = (p1,) # normal node
98 parents[rev] = (p1,) # normal node
99 else:
99 else:
100 parents[rev] = (p2, p1) # merge node
100 parents[rev] = (p2, p1) # merge node
101
101
102 heads.add(rev)
102 heads.add(rev)
103
103
104 for p in parents[rev]:
104 for p in parents[rev]:
105 heads.discard(p)
105 heads.discard(p)
106 finally:
106 finally:
107 ui.progress(_('reading'), None, total=len(rl))
107 ui.progress(_('reading'), None, total=len(rl))
108
108
109 ui.status(_('sorting revs\n'))
109 ui.status(_('sorting revs\n'))
110 result = []
110 result = []
111 visit = list(heads)
111 visit = list(heads)
112 visit.sort(reverse=True)
112 visit.sort(reverse=True)
113 finished = set()
113 finished = set()
114 suboptimal = 0
114
115
115 while visit:
116 while visit:
116 cur = visit[-1]
117 cur = visit[-1]
117 for p in parents[cur]:
118 for p in parents[cur]:
118 if p not in finished:
119 if p not in finished:
119 visit.append(p)
120 visit.append(p)
120 break
121 break
121 else:
122 else:
123 curparents = rl.parentrevs(cur)
124 if result and result[-1] != curparents[0]:
125 suboptimal += 1
126
122 result.append(cur)
127 result.append(cur)
123 finished.add(cur)
128 finished.add(cur)
124 visit.pop()
129 visit.pop()
130
131 ui.note(_('%d suboptimal nodes\n') % suboptimal)
132
125 return result
133 return result
126
134
127 def toposort_postorderreverse(ui, rl):
135 def toposort_postorderreverse(ui, rl):
128 # reverse-postorder of the reverse directed graph
136 # reverse-postorder of the reverse directed graph
129
137
130 children = {}
138 children = {}
131 roots = set()
139 roots = set()
132 ui.status(_('reading revs\n'))
140 ui.status(_('reading revs\n'))
133 try:
141 try:
134 for rev in rl:
142 for rev in rl:
135 ui.progress(_('reading'), rev, total=len(rl))
143 ui.progress(_('reading'), rev, total=len(rl))
136 (p1, p2) = rl.parentrevs(rev)
144 (p1, p2) = rl.parentrevs(rev)
137 if p1 == p2 == node.nullrev:
145 if p1 == p2 == node.nullrev:
138 roots.add(rev)
146 roots.add(rev)
139
147
140 children[rev] = []
148 children[rev] = []
141
149
142 if p1 != node.nullrev:
150 if p1 != node.nullrev:
143 children[p1].append(rev)
151 children[p1].append(rev)
144 if p2 != node.nullrev:
152 if p2 != node.nullrev:
145 children[p2].append(rev)
153 children[p2].append(rev)
146 finally:
154 finally:
147 ui.progress(_('reading'), None, total=len(rl))
155 ui.progress(_('reading'), None, total=len(rl))
148
156
149 ui.status(_('sorting revs\n'))
157 ui.status(_('sorting revs\n'))
150 result = []
158 result = []
151 visit = list(roots)
159 visit = list(roots)
152 visit.sort()
160 visit.sort()
153 finished = set()
161 finished = set()
162 suboptimal = 0
163
154 while visit:
164 while visit:
155 cur = visit[-1]
165 cur = visit[-1]
156 for p in children[cur]:
166 for p in children[cur]:
157 if p not in finished:
167 if p not in finished:
158 visit.append(p)
168 visit.append(p)
159 break
169 break
160 else:
170 else:
171 # if cur is not the first parent of its successor, then the
172 # successor is a suboptimal node
173 if result:
174 succparents = rl.parentrevs(result[-1])
175 if succparents[0] != cur:
176 suboptimal += 1
177
161 result.append(cur)
178 result.append(cur)
162 finished.add(cur)
179 finished.add(cur)
163 visit.pop()
180 visit.pop()
181
182 ui.note(_('%d suboptimal nodes\n') % suboptimal)
183
164 result.reverse()
184 result.reverse()
165 return result
185 return result
166
186
167 def writerevs(ui, r1, r2, order, tr):
187 def writerevs(ui, r1, r2, order, tr):
168
188
169 ui.status(_('writing revs\n'))
189 ui.status(_('writing revs\n'))
170
190
171 count = [0]
191 count = [0]
172 def progress(*args):
192 def progress(*args):
173 ui.progress(_('writing'), count[0], total=len(order))
193 ui.progress(_('writing'), count[0], total=len(order))
174 count[0] += 1
194 count[0] += 1
175
195
176 order = [r1.node(r) for r in order]
196 order = [r1.node(r) for r in order]
177
197
178 # this is a bit ugly, but it works
198 # this is a bit ugly, but it works
179 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
199 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
180 unlookup = lambda x: int(x, 10)
200 unlookup = lambda x: int(x, 10)
181
201
182 try:
202 try:
183 group = util.chunkbuffer(r1.group(order, lookup, progress))
203 group = util.chunkbuffer(r1.group(order, lookup, progress))
184 chunkiter = changegroup.chunkiter(group)
204 chunkiter = changegroup.chunkiter(group)
185 r2.addgroup(chunkiter, unlookup, tr)
205 r2.addgroup(chunkiter, unlookup, tr)
186 finally:
206 finally:
187 ui.progress(_('writing'), None, len(order))
207 ui.progress(_('writing'), None, len(order))
188
208
189 def report(ui, r1, r2):
209 def report(ui, r1, r2):
190 def getsize(r):
210 def getsize(r):
191 s = 0
211 s = 0
192 for fn in (r.indexfile, r.datafile):
212 for fn in (r.indexfile, r.datafile):
193 try:
213 try:
194 s += os.stat(fn).st_size
214 s += os.stat(fn).st_size
195 except OSError, inst:
215 except OSError, inst:
196 if inst.errno != errno.ENOENT:
216 if inst.errno != errno.ENOENT:
197 raise
217 raise
198 return s
218 return s
199
219
200 oldsize = float(getsize(r1))
220 oldsize = float(getsize(r1))
201 newsize = float(getsize(r2))
221 newsize = float(getsize(r2))
202
222
203 # argh: have to pass an int to %d, because a float >= 2^32
223 # argh: have to pass an int to %d, because a float >= 2^32
204 # blows up under Python 2.5 or earlier
224 # blows up under Python 2.5 or earlier
205 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
225 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
206 % (int(oldsize), oldsize / 1024 / 1024))
226 % (int(oldsize), oldsize / 1024 / 1024))
207 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
227 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
208 % (int(newsize), newsize / 1024 / 1024))
228 % (int(newsize), newsize / 1024 / 1024))
209
229
210 shrink_percent = (oldsize - newsize) / oldsize * 100
230 shrink_percent = (oldsize - newsize) / oldsize * 100
211 shrink_factor = oldsize / newsize
231 shrink_factor = oldsize / newsize
212 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
232 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
213 % (shrink_percent, shrink_factor))
233 % (shrink_percent, shrink_factor))
214
234
215 def shrink(ui, repo, **opts):
235 def shrink(ui, repo, **opts):
216 """shrink a revlog by reordering revisions
236 """shrink a revlog by reordering revisions
217
237
218 Rewrites all the entries in some revlog of the current repository
238 Rewrites all the entries in some revlog of the current repository
219 (by default, the manifest log) to save space.
239 (by default, the manifest log) to save space.
220
240
221 Different sort algorithms have different performance
241 Different sort algorithms have different performance
222 characteristics. Use ``--sort`` to select a sort algorithm so you
242 characteristics. Use ``--sort`` to select a sort algorithm so you
223 can determine which works best for your data. The default
243 can determine which works best for your data. The default
224 algorithm, ``branchsort``, works well for workflows with lots of
244 algorithm, ``branchsort``, works well for workflows with lots of
225 active (unmerged) branches, but not so well when all branches have
245 active (unmerged) branches, but not so well when all branches have
226 been merged and there is only one repository head.
246 been merged and there is only one repository head.
227 """
247 """
228
248
229 if not repo.local():
249 if not repo.local():
230 raise util.Abort(_('not a local repository: %s') % repo.root)
250 raise util.Abort(_('not a local repository: %s') % repo.root)
231
251
232 fn = opts.get('revlog')
252 fn = opts.get('revlog')
233 if not fn:
253 if not fn:
234 indexfn = repo.sjoin('00manifest.i')
254 indexfn = repo.sjoin('00manifest.i')
235 else:
255 else:
236 if not fn.endswith('.i'):
256 if not fn.endswith('.i'):
237 raise util.Abort(_('--revlog option must specify the revlog index '
257 raise util.Abort(_('--revlog option must specify the revlog index '
238 'file (*.i), not %s') % opts.get('revlog'))
258 'file (*.i), not %s') % opts.get('revlog'))
239
259
240 indexfn = os.path.realpath(fn)
260 indexfn = os.path.realpath(fn)
241 store = repo.sjoin('')
261 store = repo.sjoin('')
242 if not indexfn.startswith(store):
262 if not indexfn.startswith(store):
243 raise util.Abort(_('--revlog option must specify a revlog in %s, '
263 raise util.Abort(_('--revlog option must specify a revlog in %s, '
244 'not %s') % (store, indexfn))
264 'not %s') % (store, indexfn))
245
265
246 sortname = opts['sort']
266 sortname = opts['sort']
247 try:
267 try:
248 toposort = globals()['toposort_' + sortname]
268 toposort = globals()['toposort_' + sortname]
249 except KeyError:
269 except KeyError:
250 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
270 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
251
271
252 if not os.path.exists(indexfn):
272 if not os.path.exists(indexfn):
253 raise util.Abort(_('no such file: %s') % indexfn)
273 raise util.Abort(_('no such file: %s') % indexfn)
254 if '00changelog' in indexfn:
274 if '00changelog' in indexfn:
255 raise util.Abort(_('shrinking the changelog '
275 raise util.Abort(_('shrinking the changelog '
256 'will corrupt your repository'))
276 'will corrupt your repository'))
257
277
258 ui.write(_('shrinking %s\n') % indexfn)
278 ui.write(_('shrinking %s\n') % indexfn)
259 prefix = os.path.basename(indexfn)[:-1]
279 prefix = os.path.basename(indexfn)[:-1]
260 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
280 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
261 prefix=prefix,
281 prefix=prefix,
262 suffix='.i')
282 suffix='.i')
263 os.close(tmpfd)
283 os.close(tmpfd)
264
284
265 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
285 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
266 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
286 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
267
287
268 datafn, tmpdatafn = r1.datafile, r2.datafile
288 datafn, tmpdatafn = r1.datafile, r2.datafile
269
289
270 oldindexfn = indexfn + '.old'
290 oldindexfn = indexfn + '.old'
271 olddatafn = datafn + '.old'
291 olddatafn = datafn + '.old'
272 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
292 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
273 raise util.Abort(_('one or both of\n'
293 raise util.Abort(_('one or both of\n'
274 ' %s\n'
294 ' %s\n'
275 ' %s\n'
295 ' %s\n'
276 'exists from a previous run; please clean up '
296 'exists from a previous run; please clean up '
277 'before running again') % (oldindexfn, olddatafn))
297 'before running again') % (oldindexfn, olddatafn))
278
298
279 # Don't use repo.transaction(), because then things get hairy with
299 # Don't use repo.transaction(), because then things get hairy with
280 # paths: some need to be relative to .hg, and some need to be
300 # paths: some need to be relative to .hg, and some need to be
281 # absolute. Doing it this way keeps things simple: everything is an
301 # absolute. Doing it this way keeps things simple: everything is an
282 # absolute path.
302 # absolute path.
283 lock = repo.lock(wait=False)
303 lock = repo.lock(wait=False)
284 tr = transaction.transaction(ui.warn,
304 tr = transaction.transaction(ui.warn,
285 open,
305 open,
286 repo.sjoin('journal'))
306 repo.sjoin('journal'))
287
307
288 def ignoremissing(func):
308 def ignoremissing(func):
289 def f(*args, **kw):
309 def f(*args, **kw):
290 try:
310 try:
291 return func(*args, **kw)
311 return func(*args, **kw)
292 except OSError, inst:
312 except OSError, inst:
293 if inst.errno != errno.ENOENT:
313 if inst.errno != errno.ENOENT:
294 raise
314 raise
295 return f
315 return f
296
316
297 try:
317 try:
298 try:
318 try:
299 order = toposort(ui, r1)
319 order = toposort(ui, r1)
300 writerevs(ui, r1, r2, order, tr)
320 writerevs(ui, r1, r2, order, tr)
301 report(ui, r1, r2)
321 report(ui, r1, r2)
302 tr.close()
322 tr.close()
303 except:
323 except:
304 # Abort transaction first, so we truncate the files before
324 # Abort transaction first, so we truncate the files before
305 # deleting them.
325 # deleting them.
306 tr.abort()
326 tr.abort()
307 for fn in (tmpindexfn, tmpdatafn):
327 for fn in (tmpindexfn, tmpdatafn):
308 ignoremissing(os.unlink)(fn)
328 ignoremissing(os.unlink)(fn)
309 raise
329 raise
310 if not opts.get('dry_run'):
330 if not opts.get('dry_run'):
311 # racy, both files cannot be renamed atomically
331 # racy, both files cannot be renamed atomically
312 # copy files
332 # copy files
313 util.os_link(indexfn, oldindexfn)
333 util.os_link(indexfn, oldindexfn)
314 ignoremissing(util.os_link)(datafn, olddatafn)
334 ignoremissing(util.os_link)(datafn, olddatafn)
315 # rename
335 # rename
316 util.rename(tmpindexfn, indexfn)
336 util.rename(tmpindexfn, indexfn)
317 try:
337 try:
318 util.rename(tmpdatafn, datafn)
338 util.rename(tmpdatafn, datafn)
319 except OSError, inst:
339 except OSError, inst:
320 if inst.errno != errno.ENOENT:
340 if inst.errno != errno.ENOENT:
321 raise
341 raise
322 ignoremissing(os.unlink)(datafn)
342 ignoremissing(os.unlink)(datafn)
323 else:
343 else:
324 for fn in (tmpindexfn, tmpdatafn):
344 for fn in (tmpindexfn, tmpdatafn):
325 ignoremissing(os.unlink)(fn)
345 ignoremissing(os.unlink)(fn)
326 finally:
346 finally:
327 lock.release()
347 lock.release()
328
348
329 if not opts.get('dry_run'):
349 if not opts.get('dry_run'):
330 ui.write(_('note: old revlog saved in:\n'
350 ui.write(_('note: old revlog saved in:\n'
331 ' %s\n'
351 ' %s\n'
332 ' %s\n'
352 ' %s\n'
333 '(You can delete those files when you are satisfied that your\n'
353 '(You can delete those files when you are satisfied that your\n'
334 'repository is still sane. '
354 'repository is still sane. '
335 'Running \'hg verify\' is strongly recommended.)\n')
355 'Running \'hg verify\' is strongly recommended.)\n')
336 % (oldindexfn, olddatafn))
356 % (oldindexfn, olddatafn))
337
357
338 cmdtable = {
358 cmdtable = {
339 'shrink': (shrink,
359 'shrink': (shrink,
340 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
360 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
341 ('n', 'dry-run', None, _('do not shrink, simulate only')),
361 ('n', 'dry-run', None, _('do not shrink, simulate only')),
342 ('', 'sort', 'branchsort', 'name of sort algorithm to use'),
362 ('', 'sort', 'branchsort', 'name of sort algorithm to use'),
343 ],
363 ],
344 _('hg shrink [--revlog PATH]'))
364 _('hg shrink [--revlog PATH]'))
345 }
365 }
346
366
347 if __name__ == "__main__":
367 if __name__ == "__main__":
348 print "shrink-revlog.py is now an extension (see hg help extensions)"
368 print "shrink-revlog.py is now an extension (see hg help extensions)"
General Comments 0
You need to be logged in to leave comments. Login now