Show More
@@ -1,259 +1,281 | |||
|
1 | 1 | #!/usr/bin/env python3 |
|
2 | 2 | # |
|
3 | 3 | # generate-branchy-bundle - generate a branch for a "large" branchy repository |
|
4 | 4 | # |
|
5 | 5 | # Copyright 2018 Octobus, contact@octobus.net |
|
6 | 6 | # |
|
7 | 7 | # This software may be used and distributed according to the terms of the |
|
8 | 8 | # GNU General Public License version 2 or any later version. |
|
9 | 9 | # |
|
10 | 10 | # This script generates a repository suitable for testing delta computation |
|
11 | 11 | # strategies. |
|
12 | 12 | # |
|
13 | 13 | # The repository updates a single "large" file with many updates. One fixed part |
|
14 | 14 | # of the files always get updated while the rest of the lines get updated over |
|
15 | 15 | # time. This update happens over many topological branches, some getting merged |
|
16 | 16 | # back. |
|
17 | # | |
|
18 | # Running with `chg` in your path and `CHGHG` set is recommended for speed. | |
|
19 | 17 | |
|
20 | 18 | |
|
21 | 19 | import hashlib |
|
22 | 20 | import os |
|
23 | 21 | import shutil |
|
24 | 22 | import subprocess |
|
25 | 23 | import sys |
|
26 | 24 | import tempfile |
|
27 | 25 | |
|
26 | import mercurial.context | |
|
27 | import mercurial.hg | |
|
28 | import mercurial.ui | |
|
29 | ||
|
# name of the generated bundle file (written under ../cache/ by the main guard)
BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
NB_CHANGESET = 5000  # total number of changesets to generate
PERIOD_MERGING = 8  # every 8th changeset is a merge (second parent = min head)
PERIOD_BRANCHING = 7  # every 7th changeset branches off an older revision
MOVE_BACK_MIN = 3  # minimum number of revisions to walk back when branching
MOVE_BACK_RANGE = 5  # extra (idx-dependent, modular) walk-back distance

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes,
# and a set of other lines changes too.
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
NB_LINES = 10500  # total number of lines in the generated file
ALWAYS_CHANGE_LINES = 500  # leading lines that are rewritten at every revision
OTHER_CHANGES = 300  # modular period controlling which later lines change
|
45 | 47 | |
|
46 | 48 | |
|
def build_graph():
    """Return the shape of the changeset graph as {rev: (p1, p2)}.

    Revision 0 is the root (both parents None). Every PERIOD_BRANCHING
    revisions the first parent moves back a few revisions (creating a new
    topological branch); every PERIOD_MERGING revisions the oldest head is
    merged back in as a second parent.
    """
    heads = {0}
    graph = {0: (None, None)}
    for rev in range(1, NB_CHANGESET + 1):
        first_parent = rev - 1
        second_parent = None
        if rev % PERIOD_BRANCHING == 0:
            # walk the first parent back to an older revision to branch off
            distance = MOVE_BACK_MIN + (rev % MOVE_BACK_RANGE)
            for _ in range(distance):
                first_parent = graph.get(first_parent, (first_parent,))[0]
        if rev % PERIOD_MERGING == 0:
            # merge back the oldest remaining head
            second_parent = min(heads)
        heads.discard(first_parent)
        heads.discard(second_parent)
        heads.add(rev)
        graph[rev] = (first_parent, second_parent)
    return graph
|
64 | 66 | |
|
65 | 67 | |
|
# changeset graph shape, computed once at import time
GRAPH = build_graph()
|
67 | 69 | |
|
68 | 70 | |
|
def nextcontent(previous_content):
    """Derive a new line of file content from the previous one.

    Returns the hex MD5 digest of *previous_content* as ASCII bytes, so
    successive lines form a deterministic hash chain.
    """
    digest = hashlib.md5(previous_content).hexdigest()
    return digest.encode('ascii')
|
72 | 74 | |
|
73 | 75 | |
|
def filecontent(iteridx, oldcontent):
    """Generate a new file content, one line (bytes) at a time.

    The content is derived from the iteration index and the previous
    content. The first ALWAYS_CHANGE_LINES lines are always regenerated;
    later lines are regenerated only on a modular schedule depending on
    *iteridx*, otherwise the old line is reused.
    """
    # seed of the hash chain: empty on the initial call, otherwise the index
    current = b'' if iteridx == 0 else b"%d" % iteridx

    for lineno in range(NB_LINES):
        if oldcontent is None or lineno <= ALWAYS_CHANGE_LINES:
            regenerate = True
        else:
            regenerate = (lineno - iteridx) % OTHER_CHANGES == 0

        if regenerate:
            yield current + b'\n'
            current = nextcontent(current)
        else:
            yield oldcontent[lineno]
|
97 | 99 | |
|
98 | 100 | |
|
def merge_content(base, left, right):
    """Merge two file contents (against their common base) into a new one.

    Takes the unambiguous update from either side when possible, and
    produces a brand new line whenever a real merge is needed. Similar to
    what a manifest merge would do.

    Fix: the original loop reused the names ``left`` and ``right`` as loop
    variables, shadowing the parameters; distinct names are used instead.
    """
    for old_line, left_line, right_line in zip(base, left, right):
        if old_line == left_line:
            # left side unchanged: take right (equals old when right is
            # unchanged too)
            yield right_line
        elif old_line == right_line:
            # only the left side changed
            yield left_line
        else:
            # both sides changed: synthesize a fresh line
            yield nextcontent(left_line + right_line)
|
114 | 116 | |
|
115 | 117 | |
|
def ancestors(graph, rev):
    """Return the set of ancestors of revision <rev>, including <rev> itself."""
    seen = {rev}
    pending = [rev]
    # simple worklist traversal over the parent tuples
    while pending:
        node = pending.pop()
        for parent in graph[node]:
            if parent is not None and parent not in seen:
                seen.add(parent)
                pending.append(parent)
    return seen
|
130 | 132 | |
|
131 | 133 | |
|
def gca(graph, left, right):
    """Find the greatest common ancestor of <left> and <right>.

    Note that the algorithm is naive and quadratic when run over every
    merge; that is acceptable at the current scale.
    """
    common = ancestors(graph, left) & ancestors(graph, right)
    return max(common)
|
139 | 141 | |
|
140 | 142 | |
|
def make_one_content_fn(idx, base, left, right):
    """Build a function that computes this revision's content on demand.

    The dependencies are kept as references so they are not
    garbage-collected before we use them. Once the content has been
    computed, the references are dropped so the dependencies can be
    collected.
    """

    def content_fn(idx=idx, base=base, left=left, right=right):
        if left is None:
            # root revision: build from scratch
            new = filecontent(idx, None)
        elif base is None:
            # linear revision: derive from the single parent's content
            new = filecontent(idx, left())
        else:
            # merge revision: merge both parents against their base first
            merged = merge_content(base(), left(), right())
            new = filecontent(idx, list(merged))
        return list(new)

    # content_fn captured its own copies as default argument values; drop
    # the enclosing references so they do not pin the dependencies.
    del idx, base, left, right

    value = None
    # keep content_fn reachable through a one-shot container so it can be
    # released right after the single computation
    holder = [content_fn]
    del content_fn

    def final_fn():
        nonlocal value
        if value is None:
            fn = holder.pop()
            value = list(fn())
            del fn
        return value

    return final_fn
|
177 | 179 | |
|
178 | 180 | |
|
def build_content_graph(graph):
    """Produce (lazy) file content for every revision in <graph>.

    The content is generated on demand and cached; pop entries from the
    returned dictionary as you consume them to reduce memory usage.
    """
    content = {}
    for rev, (p1, p2) in graph.items():
        left = content[p1] if p1 is not None else None
        base = right = None
        if p2 is not None:
            right = content[p2]
            # merges need the common base to do a three-way merge
            base = content[gca(graph, p1, p2)]
        content[rev] = make_one_content_fn(rev, base, left, right)
    return content
|
196 | 198 | |
|
197 | 199 | |
|
# on-demand content generator for each revision; entries are popped as consumed
CONTENT = build_content_graph(GRAPH)
|
199 | 201 | |
|
200 | 202 | |
|
def hg(command, *args):
    """Call a Mercurial command with appropriate config and arguments.

    Uses ``chg`` when CHGHG is set in the environment, neutralizes any
    user configuration via an empty HGRCPATH, and pins commit/merge
    options for reproducibility. Returns the subprocess' exit status
    (raises on failure).
    """
    env = os.environ.copy()
    cmd = ['chg' if 'CHGHG' in env else 'hg', '--quiet', command]
    if command == 'commit':
        # reproducible commit metadata
        cmd += ['--date', '0 0', '--user', 'test']
    elif command == 'merge':
        # avoid conflicts by picking the local variant
        cmd += ['--tool', ':merge-local']
    cmd.extend(args)
    env['HGRCPATH'] = ''
    return subprocess.check_call(cmd, env=env)
|
219 | 221 | |
|
220 | 222 | |
|
def write_repo(path):
    """Write the repository content in memory via the Mercurial API."""
    repo = mercurial.hg.repository(
        mercurial.ui.ui.load(),
        path=path.encode('utf-8'),
    )
    # maps our graph indexes to commit nodes; None stands for the null parent
    nodemap = {None: repo.nodeconstants.nullid}
    with repo.lock(), repo.transaction(b'bundle-generation'):
        for idx, (p1, p2) in GRAPH.items():
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (idx, NB_CHANGESET))

            # early-bind idx so the callback is independent of the loop
            # variable (the original lambda relied on being called within
            # the same iteration)
            def file_fn(repo, memctx, path, _idx=idx):
                return mercurial.context.memfilectx(
                    repo,
                    memctx,
                    path,
                    # pop the value to let it be garbage-collected eventually
                    data=b''.join(CONTENT.pop(_idx)()),
                )

            mc = mercurial.context.memctx(
                repo,
                (nodemap[p1], nodemap[p2]),
                b'commit #%d' % idx if idx else b'initial commit',
                [FILENAME.encode('ascii')],
                file_fn,
                user=b"test",
                date=(0, 0),
            )
            nodemap[idx] = repo.commitctx(mc)
|
252 | ||
|
253 | ||
|
def run(target):
    """Generate the repository in a scratch directory and bundle it.

    Writes the bundle to <target> plus an MD5 checksum to <target>.md5.
    The scratch directory is always removed. Returns 0.
    """
    workdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')
    try:
        os.chdir(workdir)
        # a large maxchainlen keeps all revisions in one delta chain
        hg(
            'init',
            '--config',
            'format.maxchainlen=%d' % NB_CHANGESET,
        )
        write_repo(workdir)
        hg('bundle', '--all', target, '--config', 'devel.bundle.delta=p1')
        with open(target, 'rb') as bundle:
            digest = hashlib.md5(bundle.read()).hexdigest()
        with open(target + '.md5', 'wb') as md5file:
            md5file.write(digest.encode('ascii') + b'\n')
        if sys.stdout.isatty():
            print('bundle generated at "%s" md5: %s' % (target, digest))

    finally:
        shutil.rmtree(workdir)
    return 0
|
254 | 276 | |
|
255 | 277 | |
|
if __name__ == '__main__':
    # place the bundle next to this script, under ../cache/
    script_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
    bundle_path = os.path.join(script_dir, os.pardir, 'cache', BUNDLE_NAME)
    sys.exit(run(bundle_path))
General Comments 0
You need to be logged in to leave comments.
Login now