##// END OF EJS Templates
shrink-revlog: improve performance: use changegroup instead of revisions...
Benoit Boissinot -
r10009:69dca857 default
parent child Browse files
Show More
@@ -1,218 +1,224 b''
1 1 #!/usr/bin/env python
2 2
3 3 """\
4 4 Reorder a revlog (by default the manifest file in the current
5 5 repository) to save space. Specifically, this topologically sorts the
6 6 revisions in the revlog so that revisions on the same branch are adjacent
7 7 as much as possible. This is a workaround for the fact that Mercurial
8 8 computes deltas relative to the previous revision rather than relative to a
9 9 parent revision. This is *not* safe to run on a changelog.
10 10 """
11 11
12 12 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
13 13 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
14 14 # renamed by Greg Ward <greg at gerg.ca>.
15 15
16 16 # XXX would be nice to have a way to verify the repository after shrinking,
17 17 # e.g. by comparing "before" and "after" states of random changesets
18 18 # (maybe: export before, shrink, export after, diff).
19 19
20 20 import sys, os, tempfile
21 21 import optparse
22 22 from mercurial import ui as ui_, hg, revlog, transaction, node, util
23 from mercurial import changegroup
23 24
def toposort(rl):
    """Return the revision numbers of rl in a branch-friendly topological order.

    rl must support len(), iteration over its revision numbers, and
    parentrevs(rev).  Every revision is emitted after all of its parents,
    and the children of a revision are followed depth-first so that a line
    of development stays contiguous in the result.

    Progress is written to sys.stdout.  Returns a list of revision numbers.
    """
    write = sys.stdout.write
    nullrev = node.nullrev
    parentrevs = rl.parentrevs

    children = {}
    roots = []
    # build children and roots
    write('reading %d revs ' % len(rl))
    try:
        for rev in rl:
            children[rev] = []
            parents = [p for p in parentrevs(rev) if p != nullrev]
            # in case of duplicate parents
            if len(parents) == 2 and parents[0] == parents[1]:
                del parents[1]
            for p in parents:
                # Internal invariant: a parent has already been iterated
                # over (and so registered) before any of its children.
                assert p in children
                children[p].append(rev)

            if not parents:
                roots.append(rev)

            if rev % 1000 == 0:
                write('.')
    finally:
        write('\n')

    # XXX this is a reimplementation of the 'branchsort' topo sort
    # algorithm in hgext.convert.convcmd... would be nice not to duplicate
    # the algorithm
    write('sorting ...')
    # The pending revisions are kept as a stack whose *end* is the next one
    # to visit.  This produces exactly the order of "pop from the front,
    # prepend the ready children", but without copying the whole list on
    # every step (which made the sort quadratic).
    pending = roots[::-1]
    ret = []
    while pending:
        rev = pending.pop()
        ret.append(rev)
        if rev not in children:
            # This only happens if some node's p1 == p2, which can
            # happen in the manifest in certain circumstances.
            continue
        ready = []
        for child in children.pop(rev):
            # A revision still present in 'children' has not been emitted
            # yet; a child is ready once none of its parents are left.
            blocked = False
            for p in parentrevs(child):
                if p != nullrev and p in children:
                    blocked = True
                    break
            if not blocked:
                ready.append(child)
        ready.reverse()
        pending.extend(ready)
    write('\n')
    return ret
72 73
def writerevs(r1, r2, order, tr):
    """Copy the revisions of r1 into r2 as a single group, in 'order'.

    r1    -- source revlog (read through node(), rev(), linkrev(), group())
    r2    -- destination revlog (written through addgroup())
    order -- revision numbers of r1, in the order they should be written
    tr    -- transaction handed to r2.addgroup()

    Progress dots are written to sys.stdout; a newline is always written
    at the end, even if the copy fails.
    """
    out = sys.stdout.write
    out('writing %d revs ' % len(order))

    # One-element list used as a mutable cell so the nested callback can
    # update the counter.
    calls = [0]

    def progress(*args):
        # Emit one dot for every 1000 invocations.
        if calls[0] % 1000 == 0:
            out('.')
        calls[0] += 1

    # this is a bit ugly, but it works: the link revision of each node is
    # smuggled through the group as a zero-padded 20-character decimal
    # string and turned back into an integer on the receiving side.
    # NOTE(review): presumably 20 characters because group()/addgroup()
    # expect a node-sized identifier here -- confirm against revlog.
    def lookup(n):
        return "%020d" % r1.linkrev(r1.rev(n))

    def unlookup(text):
        return int(text, 10)

    nodes = [r1.node(rev) for rev in order]

    try:
        chunks = r1.group(nodes, lookup, progress)
        buffered = util.chunkbuffer(chunks)
        r2.addgroup(changegroup.chunkiter(buffered), unlookup, tr)
    finally:
        out('\n')
90 96
def report(olddatafn, newdatafn):
    """Print the sizes of the old and new data files and the space saved.

    olddatafn, newdatafn -- paths of the two files, passed to os.stat()

    Three lines are written to sys.stdout: the old size, the new size and
    the shrinkage as a percentage and as a factor.  A figure that cannot
    be computed because one of the files is empty is shown as 'n/a'
    rather than raising ZeroDivisionError.  Errors from os.stat() (for
    example a missing file) propagate to the caller.  Returns None.
    """
    oldsize = float(os.stat(olddatafn).st_size)
    newsize = float(os.stat(newdatafn).st_size)

    # argh: have to pass an int to %d, because a float >= 2^32
    # blows up under Python 2.5 or earlier
    sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n'
                     % (int(oldsize), oldsize/1024/1024))
    sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n'
                     % (int(newsize), newsize/1024/1024))

    # The percentage is relative to the old size, the factor to the new
    # size; each is undefined when its denominator is zero.
    if oldsize:
        shrink_percent = '%.1f%%' % ((oldsize - newsize) / oldsize * 100)
    else:
        shrink_percent = 'n/a'
    if newsize:
        shrink_factor = '%.1fx' % (oldsize / newsize)
    else:
        shrink_factor = 'n/a'
    sys.stdout.write('shrinkage: %s (%s)\n'
                     % (shrink_percent, shrink_factor))
106 112
def main():
    """Parse the command line and shrink one revlog of a local repository.

    Steps: validate the options and the target files, take the repository
    lock, rewrite the revlog into temporary files next to the original
    (toposort + writerevs), print the size report, hard-link the original
    files to *.old and rename the temporary files into place.

    Usage errors are reported through parser.error().  If anything fails
    while rewriting, the transaction (if one was started) is aborted, the
    temporary files are removed and the exception is re-raised.  The
    repository lock is held from before the revlog is read until after
    the new files have been moved into place.
    """

    # Unbuffer stdout for nice progress output.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    write = sys.stdout.write

    parser = optparse.OptionParser(description=__doc__)
    parser.add_option('-R', '--repository',
                      default=os.path.curdir,
                      metavar='REPO',
                      help='repository root directory [default: current dir]')
    parser.add_option('--revlog',
                      metavar='FILE',
                      help='shrink FILE [default: REPO/.hg/store/00manifest.i]')
    (options, args) = parser.parse_args()
    if args:
        parser.error('too many arguments')

    # Open the specified repository.
    ui = ui_.ui()
    repo = hg.repository(ui, options.repository)
    if not repo.local():
        parser.error('not a local repository: %s' % options.repository)

    if options.revlog is None:
        indexfn = repo.sjoin('00manifest.i')
    else:
        if not options.revlog.endswith('.i'):
            parser.error('--revlog option must specify the revlog index file '
                         '(*.i), not %s' % options.revlog)

        indexfn = os.path.realpath(options.revlog)
        store = repo.sjoin('')
        if not indexfn.startswith(store):
            parser.error('--revlog option must specify a revlog in %s, not %s'
                         % (store, indexfn))

    datafn = indexfn[:-2] + '.d'
    if not os.path.exists(indexfn):
        parser.error('no such file: %s' % indexfn)
    if '00changelog' in indexfn:
        parser.error('shrinking the changelog will corrupt your repository')
    if not os.path.exists(datafn):
        # This is just a lazy shortcut because I can't be bothered to
        # handle all the special cases that entail from no .d file.
        parser.error('%s does not exist: revlog not big enough '
                     'to be worth shrinking' % datafn)

    oldindexfn = indexfn + '.old'
    olddatafn = datafn + '.old'
    if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
        parser.error('one or both of\n'
                     '  %s\n'
                     '  %s\n'
                     'exists from a previous run; please clean up before '
                     'running again'
                     % (oldindexfn, olddatafn))

    write('shrinking %s\n' % indexfn)

    # Take the lock before reading the revlog and keep it until the new
    # files are in place, so nothing else can write to the revlog while
    # it is being copied or swapped.
    lock = repo.lock(wait=False)
    try:
        prefix = os.path.basename(indexfn)[:-1]
        (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
                                               prefix=prefix,
                                               suffix='.i')
        tmpdatafn = tmpindexfn[:-2] + '.d'
        os.close(tmpfd)

        tr = None
        try:
            r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
            r2 = revlog.revlog(util.opener(os.getcwd(), audit=False),
                               tmpindexfn)

            # Don't use repo.transaction(), because then things get hairy
            # with paths: some need to be relative to .hg, and some need to
            # be absolute.  Doing it this way keeps things simple:
            # everything is an absolute path.
            tr = transaction.transaction(sys.stderr.write,
                                         open,
                                         repo.sjoin('journal'))

            order = toposort(r1)
            writerevs(r1, r2, order, tr)
            report(datafn, tmpdatafn)
            tr.close()
        except:
            # Deliberately catches everything (including Ctrl-C): the
            # exception is re-raised once the cleanup is done.
            # Abort transaction first, so we truncate the files before
            # deleting them.
            if tr is not None:
                tr.abort()
            for fn in (tmpindexfn, tmpdatafn):
                if os.path.exists(fn):
                    os.unlink(fn)
            raise

        # Keep the originals reachable as *.old, then move the new files
        # over them.  Still racy between the two renames: both files
        # cannot be replaced atomically.
        os.link(indexfn, oldindexfn)
        os.link(datafn, olddatafn)
        os.rename(tmpindexfn, indexfn)
        os.rename(tmpdatafn, datafn)
    finally:
        lock.release()

    write('note: old revlog saved in:\n'
          '  %s\n'
          '  %s\n'
          '(You can delete those files when you are satisfied that your\n'
          'repository is still sane.  '
          'Running \'hg verify\' is strongly recommended.)\n'
          % (oldindexfn, olddatafn))
214 220
# Script entry point.  There is no __main__ guard, so this runs whenever
# the file is executed or imported.  A Ctrl-C (KeyboardInterrupt) is
# turned into sys.exit("interrupted") instead of a traceback.
try:
    main()
except KeyboardInterrupt:
    sys.exit("interrupted")
General Comments 0
You need to be logged in to leave comments. Login now