##// END OF EJS Templates
shrink-revlog: use util.mktempcopy() to preserve mode of index file....
Greg Ward -
r11294:7b5d05e0 default
parent child Browse files
Show More
@@ -1,295 +1,289 b''
1 1 #!/usr/bin/env python
2 2
3 3 """\
4 4 reorder a revlog (the manifest by default) to save space
5 5
6 6 Specifically, this topologically sorts the revisions in the revlog so that
7 7 revisions on the same branch are adjacent as much as possible. This is a
8 8 workaround for the fact that Mercurial computes deltas relative to the
9 9 previous revision rather than relative to a parent revision.
10 10
11 11 This is *not* safe to run on a changelog.
12 12 """
13 13
14 14 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
15 15 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
16 16 # renamed by Greg Ward <greg at gerg.ca>.
17 17
18 18 # XXX would be nice to have a way to verify the repository after shrinking,
19 19 # e.g. by comparing "before" and "after" states of random changesets
20 20 # (maybe: export before, shrink, export after, diff).
21 21
22 22 import os, tempfile, errno
23 23 from mercurial import revlog, transaction, node, util
24 24 from mercurial import changegroup
25 25 from mercurial.i18n import _
26 26
27 27
28 28 def postorder(start, edges):
29 29 result = []
30 30 visit = list(start)
31 31 finished = set()
32 32
33 33 while visit:
34 34 cur = visit[-1]
35 35 for p in edges[cur]:
36 36 if p not in finished:
37 37 visit.append(p)
38 38 break
39 39 else:
40 40 result.append(cur)
41 41 finished.add(cur)
42 42 visit.pop()
43 43
44 44 return result
45 45
46 46 def toposort_reversepostorder(ui, rl):
47 47 # postorder of the reverse directed graph
48 48
49 49 # map rev to list of parent revs (p2 first)
50 50 parents = {}
51 51 heads = set()
52 52 ui.status(_('reading revs\n'))
53 53 try:
54 54 for rev in rl:
55 55 ui.progress(_('reading'), rev, total=len(rl))
56 56 (p1, p2) = rl.parentrevs(rev)
57 57 if p1 == p2 == node.nullrev:
58 58 parents[rev] = () # root node
59 59 elif p1 == p2 or p2 == node.nullrev:
60 60 parents[rev] = (p1,) # normal node
61 61 else:
62 62 parents[rev] = (p2, p1) # merge node
63 63 heads.add(rev)
64 64 for p in parents[rev]:
65 65 heads.discard(p)
66 66 finally:
67 67 ui.progress(_('reading'), None)
68 68
69 69 heads = list(heads)
70 70 heads.sort(reverse=True)
71 71
72 72 ui.status(_('sorting revs\n'))
73 73 return postorder(heads, parents)
74 74
75 75 def toposort_postorderreverse(ui, rl):
76 76 # reverse-postorder of the reverse directed graph
77 77
78 78 children = {}
79 79 roots = set()
80 80 ui.status(_('reading revs\n'))
81 81 try:
82 82 for rev in rl:
83 83 ui.progress(_('reading'), rev, total=len(rl))
84 84 (p1, p2) = rl.parentrevs(rev)
85 85 if p1 == p2 == node.nullrev:
86 86 roots.add(rev)
87 87 children[rev] = []
88 88 if p1 != node.nullrev:
89 89 children[p1].append(rev)
90 90 if p2 != node.nullrev:
91 91 children[p2].append(rev)
92 92 finally:
93 93 ui.progress(_('reading'), None)
94 94
95 95 root = list(roots)
96 96 roots.sort()
97 97
98 98 ui.status(_('sorting revs\n'))
99 99 result = postorder(roots, children)
100 100 result.reverse()
101 101 return result
102 102
103 103 def writerevs(ui, r1, r2, order, tr):
104 104
105 105 ui.status(_('writing revs\n'))
106 106
107 107 count = [0]
108 108 def progress(*args):
109 109 ui.progress(_('writing'), count[0], total=len(order))
110 110 count[0] += 1
111 111
112 112 order = [r1.node(r) for r in order]
113 113
114 114 # this is a bit ugly, but it works
115 115 lookup = lambda x: "%020d" % r1.linkrev(r1.rev(x))
116 116 unlookup = lambda x: int(x, 10)
117 117
118 118 try:
119 119 group = util.chunkbuffer(r1.group(order, lookup, progress))
120 120 chunkiter = changegroup.chunkiter(group)
121 121 r2.addgroup(chunkiter, unlookup, tr)
122 122 finally:
123 123 ui.progress(_('writing'), None)
124 124
125 125 def report(ui, r1, r2):
126 126 def getsize(r):
127 127 s = 0
128 128 for fn in (r.indexfile, r.datafile):
129 129 try:
130 130 s += os.stat(fn).st_size
131 131 except OSError, inst:
132 132 if inst.errno != errno.ENOENT:
133 133 raise
134 134 return s
135 135
136 136 oldsize = float(getsize(r1))
137 137 newsize = float(getsize(r2))
138 138
139 139 # argh: have to pass an int to %d, because a float >= 2^32
140 140 # blows up under Python 2.5 or earlier
141 141 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
142 142 % (int(oldsize), oldsize / 1024 / 1024))
143 143 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
144 144 % (int(newsize), newsize / 1024 / 1024))
145 145
146 146 shrink_percent = (oldsize - newsize) / oldsize * 100
147 147 shrink_factor = oldsize / newsize
148 148 ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
149 149 % (shrink_percent, shrink_factor))
150 150
151 151 def shrink(ui, repo, **opts):
152 152 """shrink a revlog by reordering revisions
153 153
154 154 Rewrites all the entries in some revlog of the current repository
155 155 (by default, the manifest log) to save space.
156 156
157 157 Different sort algorithms have different performance
158 158 characteristics. Use ``--sort`` to select a sort algorithm so you
159 159 can determine which works best for your data.
160 160 """
161 161
162 162 if not repo.local():
163 163 raise util.Abort(_('not a local repository: %s') % repo.root)
164 164
165 165 fn = opts.get('revlog')
166 166 if not fn:
167 167 indexfn = repo.sjoin('00manifest.i')
168 168 else:
169 169 if not fn.endswith('.i'):
170 170 raise util.Abort(_('--revlog option must specify the revlog index '
171 171 'file (*.i), not %s') % opts.get('revlog'))
172 172
173 173 indexfn = os.path.realpath(fn)
174 174 store = repo.sjoin('')
175 175 if not indexfn.startswith(store):
176 176 raise util.Abort(_('--revlog option must specify a revlog in %s, '
177 177 'not %s') % (store, indexfn))
178 178
179 179 sortname = opts['sort']
180 180 try:
181 181 toposort = globals()['toposort_' + sortname]
182 182 except KeyError:
183 183 raise util.Abort(_('no such toposort algorithm: %s') % sortname)
184 184
185 185 if not os.path.exists(indexfn):
186 186 raise util.Abort(_('no such file: %s') % indexfn)
187 187 if '00changelog' in indexfn:
188 188 raise util.Abort(_('shrinking the changelog '
189 189 'will corrupt your repository'))
190 190
191 191 ui.write(_('shrinking %s\n') % indexfn)
192 192 prefix = os.path.basename(indexfn)[:-1]
193 (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
194 prefix=prefix,
195 suffix='.i')
196 os.close(tmpfd)
193 tmpindexfn = util.mktempcopy(indexfn, emptyok=True)
197 194
198 195 r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
199 196 r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
200 197
201 198 datafn, tmpdatafn = r1.datafile, r2.datafile
202 199
203 200 oldindexfn = indexfn + '.old'
204 201 olddatafn = datafn + '.old'
205 202 if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
206 203 raise util.Abort(_('one or both of\n'
207 204 ' %s\n'
208 205 ' %s\n'
209 206 'exists from a previous run; please clean up '
210 207 'before running again') % (oldindexfn, olddatafn))
211 208
212 209 # Don't use repo.transaction(), because then things get hairy with
213 210 # paths: some need to be relative to .hg, and some need to be
214 211 # absolute. Doing it this way keeps things simple: everything is an
215 212 # absolute path.
216 213 lock = repo.lock(wait=False)
217 214 tr = transaction.transaction(ui.warn,
218 215 open,
219 216 repo.sjoin('journal'))
220 217
221 218 def ignoremissing(func):
222 219 def f(*args, **kw):
223 220 try:
224 221 return func(*args, **kw)
225 222 except OSError, inst:
226 223 if inst.errno != errno.ENOENT:
227 224 raise
228 225 return f
229 226
230 227 try:
231 228 try:
232 229 order = toposort(ui, r1)
233 230
234 231 suboptimal = 0
235 232 for i in xrange(1, len(order)):
236 233 parents = [p for p in r1.parentrevs(order[i])
237 234 if p != node.nullrev]
238 235 if parents and order[i - 1] not in parents:
239 236 suboptimal += 1
240 237 ui.note(_('%d suboptimal nodes\n') % suboptimal)
241 238
242 239 writerevs(ui, r1, r2, order, tr)
243 240 report(ui, r1, r2)
244 241 tr.close()
245 242 except:
246 243 # Abort transaction first, so we truncate the files before
247 244 # deleting them.
248 245 tr.abort()
249 246 for fn in (tmpindexfn, tmpdatafn):
250 247 ignoremissing(os.unlink)(fn)
251 248 raise
252 249 if not opts.get('dry_run'):
253 250 # racy, both files cannot be renamed atomically
254 251 # copy files
255 252 util.os_link(indexfn, oldindexfn)
256 253 ignoremissing(util.os_link)(datafn, olddatafn)
257 254
258 # mkstemp() creates files only readable by the owner
259 os.chmod(tmpindexfn, os.stat(indexfn).st_mode)
260
261 255 # rename
262 256 util.rename(tmpindexfn, indexfn)
263 257 try:
264 258 os.chmod(tmpdatafn, os.stat(datafn).st_mode)
265 259 util.rename(tmpdatafn, datafn)
266 260 except OSError, inst:
267 261 if inst.errno != errno.ENOENT:
268 262 raise
269 263 ignoremissing(os.unlink)(datafn)
270 264 else:
271 265 for fn in (tmpindexfn, tmpdatafn):
272 266 ignoremissing(os.unlink)(fn)
273 267 finally:
274 268 lock.release()
275 269
276 270 if not opts.get('dry_run'):
277 271 ui.write(_('note: old revlog saved in:\n'
278 272 ' %s\n'
279 273 ' %s\n'
280 274 '(You can delete those files when you are satisfied that your\n'
281 275 'repository is still sane. '
282 276 'Running \'hg verify\' is strongly recommended.)\n')
283 277 % (oldindexfn, olddatafn))
284 278
285 279 cmdtable = {
286 280 'shrink': (shrink,
287 281 [('', 'revlog', '', _('index (.i) file of the revlog to shrink')),
288 282 ('n', 'dry-run', None, _('do not shrink, simulate only')),
289 283 ('', 'sort', 'reversepostorder', 'name of sort algorithm to use'),
290 284 ],
291 285 _('hg shrink [--revlog PATH]'))
292 286 }
293 287
294 288 if __name__ == "__main__":
295 289 print "shrink-revlog.py is now an extension (see hg help extensions)"
General Comments 0
You need to be logged in to leave comments. Login now