##// END OF EJS Templates
shrink-revlog: use a bundler object (see d69c9510d648)
Augie Fackler -
r14030:e5dd974a default
parent child Browse files
Show More
@@ -1,287 +1,288 b''
1 1 """reorder a revlog (the manifest by default) to save space
2 2
3 3 Specifically, this topologically sorts the revisions in the revlog so that
4 4 revisions on the same branch are adjacent as much as possible. This is a
5 5 workaround for the fact that Mercurial computes deltas relative to the
6 6 previous revision rather than relative to a parent revision.
7 7
8 8 This is *not* safe to run on a changelog.
9 9 """
10 10
11 11 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
12 12 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
13 13 # renamed by Greg Ward <greg at gerg.ca>.
14 14
15 15 # XXX would be nice to have a way to verify the repository after shrinking,
16 16 # e.g. by comparing "before" and "after" states of random changesets
17 17 # (maybe: export before, shrink, export after, diff).
18 18
19 19 import os, tempfile, errno
20 20 from mercurial import revlog, transaction, node, util, scmutil
21 21 from mercurial import changegroup
22 22 from mercurial.i18n import _
23 23
24 24
25 25 def postorder(start, edges):
26 26 result = []
27 27 visit = list(start)
28 28 finished = set()
29 29
30 30 while visit:
31 31 cur = visit[-1]
32 32 for p in edges[cur]:
33 33 if p not in finished:
34 34 visit.append(p)
35 35 break
36 36 else:
37 37 result.append(cur)
38 38 finished.add(cur)
39 39 visit.pop()
40 40
41 41 return result
42 42
43 43 def toposort_reversepostorder(ui, rl):
44 44 # postorder of the reverse directed graph
45 45
46 46 # map rev to list of parent revs (p2 first)
47 47 parents = {}
48 48 heads = set()
49 49 ui.status(_('reading revs\n'))
50 50 try:
51 51 for rev in rl:
52 52 ui.progress(_('reading'), rev, total=len(rl))
53 53 (p1, p2) = rl.parentrevs(rev)
54 54 if p1 == p2 == node.nullrev:
55 55 parents[rev] = () # root node
56 56 elif p1 == p2 or p2 == node.nullrev:
57 57 parents[rev] = (p1,) # normal node
58 58 else:
59 59 parents[rev] = (p2, p1) # merge node
60 60 heads.add(rev)
61 61 for p in parents[rev]:
62 62 heads.discard(p)
63 63 finally:
64 64 ui.progress(_('reading'), None)
65 65
66 66 heads = list(heads)
67 67 heads.sort(reverse=True)
68 68
69 69 ui.status(_('sorting revs\n'))
70 70 return postorder(heads, parents)
71 71
72 72 def toposort_postorderreverse(ui, rl):
73 73 # reverse-postorder of the reverse directed graph
74 74
75 75 children = {}
76 76 roots = set()
77 77 ui.status(_('reading revs\n'))
78 78 try:
79 79 for rev in rl:
80 80 ui.progress(_('reading'), rev, total=len(rl))
81 81 (p1, p2) = rl.parentrevs(rev)
82 82 if p1 == p2 == node.nullrev:
83 83 roots.add(rev)
84 84 children[rev] = []
85 85 if p1 != node.nullrev:
86 86 children[p1].append(rev)
87 87 if p2 != node.nullrev:
88 88 children[p2].append(rev)
89 89 finally:
90 90 ui.progress(_('reading'), None)
91 91
92 92 roots = list(roots)
93 93 roots.sort()
94 94
95 95 ui.status(_('sorting revs\n'))
96 96 result = postorder(roots, children)
97 97 result.reverse()
98 98 return result
99 99
100 100 def writerevs(ui, r1, r2, order, tr):
101 101
102 102 ui.status(_('writing revs\n'))
103 103
104 104
105 105 order = [r1.node(r) for r in order]
106 106
107 107 # this is a bit ugly, but it works
108 108 count = [0]
109 def lookup(x):
109 def lookup(revl, x):
110 110 count[0] += 1
111 111 ui.progress(_('writing'), count[0], total=len(order))
112 return "%020d" % r1.linkrev(r1.rev(x))
112 return "%020d" % revl.linkrev(revl.rev(x))
113 113
114 114 unlookup = lambda x: int(x, 10)
115 115
116 116 try:
117 group = util.chunkbuffer(r1.group(order, lookup, progress))
117 bundler = changegroup.bundle10(lookup)
118 group = util.chunkbuffer(r1.group(order, bundler))
118 119 group = changegroup.unbundle10(group, "UN")
119 120 r2.addgroup(group, unlookup, tr)
120 121 finally:
121 122 ui.progress(_('writing'), None)
122 123
123 124 def report(ui, r1, r2):
124 125 def getsize(r):
125 126 s = 0
126 127 for fn in (r.indexfile, r.datafile):
127 128 try:
128 129 s += os.stat(fn).st_size
129 130 except OSError, inst:
130 131 if inst.errno != errno.ENOENT:
131 132 raise
132 133 return s
133 134
134 135 oldsize = float(getsize(r1))
135 136 newsize = float(getsize(r2))
136 137
137 138 # argh: have to pass an int to %d, because a float >= 2^32
138 139 # blows up under Python 2.5 or earlier
139 140 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
140 141 % (int(oldsize), oldsize / 1024 / 1024))
141 142 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
142 143 % (int(newsize), newsize / 1024 / 1024))
143 144
144 145 shrink_percent = (oldsize - newsize) / oldsize * 100
145 146 shrink_factor = oldsize / newsize
146 147 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
147 148 % (shrink_percent, shrink_factor))
148 149
149 150 def shrink(ui, repo, **opts):
150 151 """shrink a revlog by reordering revisions
151 152
152 153 Rewrites all the entries in some revlog of the current repository
153 154 (by default, the manifest log) to save space.
154 155
155 156 Different sort algorithms have different performance
156 157 characteristics. Use ``--sort`` to select a sort algorithm so you
157 158 can determine which works best for your data.
158 159 """
159 160
160 161 if not repo.local():
161 162 raise util.Abort(_('not a local repository: %s') % repo.root)
162 163
163 164 fn = opts.get('revlog')
164 165 if not fn:
165 166 indexfn = repo.sjoin('00manifest.i')
166 167 else:
167 168 if not fn.endswith('.i'):
168 169 raise util.Abort(_('--revlog option must specify the revlog index '
169 170 'file (*.i), not %s') % opts.get('revlog'))
170 171
171 172 indexfn = os.path.realpath(fn)
172 173 store = repo.sjoin('')
173 174 if not indexfn.startswith(store):
174 175 raise util.Abort(_('--revlog option must specify a revlog in %s, '
175 176 'not %s') % (store, indexfn))
176 177
177 178 sortname = opts['sort']
178 179 try:
179 180 toposort = globals()['toposort_' + sortname]
180 181 except KeyError:
181 182 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
182 183
183 184 if not os.path.exists(indexfn):
184 185 raise util.Abort(_('no such file: %s') % indexfn)
185 186 if '00changelog' in indexfn:
186 187 raise util.Abort(_('shrinking the changelog '
187 188 'will corrupt your repository'))
188 189
189 190 ui.write(_('shrinking %s\n') % indexfn)
190 191 prefix = os.path.basename(indexfn)[:-1]
191 192 tmpindexfn = util.mktempcopy(indexfn, emptyok=True)
192 193
193 194 r1 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), indexfn)
194 195 r2 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), tmpindexfn)
195 196
196 197 datafn, tmpdatafn = r1.datafile, r2.datafile
197 198
198 199 oldindexfn = indexfn + '.old'
199 200 olddatafn = datafn + '.old'
200 201 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
201 202 raise util.Abort(_('one or both of\n'
202 203 ' %s\n'
203 204 ' %s\n'
204 205 'exists from a previous run; please clean up '
205 206 'before running again') % (oldindexfn, olddatafn))
206 207
207 208 # Don't use repo.transaction(), because then things get hairy with
208 209 # paths: some need to be relative to .hg, and some need to be
209 210 # absolute. Doing it this way keeps things simple: everything is an
210 211 # absolute path.
211 212 lock = repo.lock(wait=False)
212 213 tr = transaction.transaction(ui.warn,
213 214 open,
214 215 repo.sjoin('journal'))
215 216
216 217 def ignoremissing(func):
217 218 def f(*args, **kw):
218 219 try:
219 220 return func(*args, **kw)
220 221 except OSError, inst:
221 222 if inst.errno != errno.ENOENT:
222 223 raise
223 224 return f
224 225
225 226 try:
226 227 try:
227 228 order = toposort(ui, r1)
228 229
229 230 suboptimal = 0
230 231 for i in xrange(1, len(order)):
231 232 parents = [p for p in r1.parentrevs(order[i])
232 233 if p != node.nullrev]
233 234 if parents and order[i - 1] not in parents:
234 235 suboptimal += 1
235 236 ui.note(_('%d suboptimal nodes\n') % suboptimal)
236 237
237 238 writerevs(ui, r1, r2, order, tr)
238 239 report(ui, r1, r2)
239 240 tr.close()
240 241 except:
241 242 # Abort transaction first, so we truncate the files before
242 243 # deleting them.
243 244 tr.abort()
244 245 for fn in (tmpindexfn, tmpdatafn):
245 246 ignoremissing(os.unlink)(fn)
246 247 raise
247 248 if not opts.get('dry_run'):
248 249 # racy, both files cannot be renamed atomically
249 250 # copy files
250 251 util.os_link(indexfn, oldindexfn)
251 252 ignoremissing(util.os_link)(datafn, olddatafn)
252 253
253 254 # rename
254 255 util.rename(tmpindexfn, indexfn)
255 256 try:
256 257 os.chmod(tmpdatafn, os.stat(datafn).st_mode)
257 258 util.rename(tmpdatafn, datafn)
258 259 except OSError, inst:
259 260 if inst.errno != errno.ENOENT:
260 261 raise
261 262 ignoremissing(os.unlink)(datafn)
262 263 else:
263 264 for fn in (tmpindexfn, tmpdatafn):
264 265 ignoremissing(os.unlink)(fn)
265 266 finally:
266 267 lock.release()
267 268
268 269 if not opts.get('dry_run'):
269 270 ui.write(_('note: old revlog saved in:\n'
270 271 ' %s\n'
271 272 ' %s\n'
272 273 '(You can delete those files when you are satisfied that your\n'
273 274 'repository is still sane. '
274 275 'Running \'hg verify\' is strongly recommended.)\n')
275 276 % (oldindexfn, olddatafn))
276 277
277 278 cmdtable = {
278 279 'shrink': (shrink,
279 280 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
280 281 ('n', 'dry-run', None, _('do not shrink, simulate only')),
281 282 ('', 'sort', 'reversepostorder', 'name of sort algorithm to use'),
282 283 ],
283 284 _('hg shrink [--revlog PATH]'))
284 285 }
285 286
286 287 if __name__ == "__main__":
287 288 print "shrink-revlog.py is now an extension (see hg help extensions)"
General Comments 0
You need to be logged in to leave comments. Login now