##// END OF EJS Templates
nodemap: keep track of the ondisk id of nodemap blocks...
marmoute -
r44802:f0862ee1 default
parent child Browse files
Show More
@@ -1,394 +1,399
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import os
12 12 import re
13 13 import struct
14 14
15 15 from .. import (
16 16 error,
17 17 node as nodemod,
18 18 util,
19 19 )
20 20
21 21
class NodeMap(dict):
    """In-memory node -> revision mapping that raises on unknown nodes."""

    def __missing__(self, key):
        # surface lookup misses as revlog errors, which is what callers
        # of a nodemap expect to catch
        raise error.RevlogError(b'unknown node: %s' % key)
25 25
26 26
def persisted_data(revlog):
    """read the nodemap for a revlog from disk

    Returns None when the revlog has no nodemap file, when the docket is
    missing or empty, or when the on-disk version is not supported.
    """
    docket_file = revlog.nodemap_file
    if docket_file is None:
        return None
    docket = revlog.opener.tryread(docket_file)
    if not docket:
        return None
    # docket layout: <version:1 byte><uid-size:1 byte><uid:uid-size bytes>
    (version,) = S_VERSION.unpack(docket[0 : S_VERSION.size])
    if version != ONDISK_VERSION:
        return None
    cursor = S_VERSION.size
    (uid_len,) = S_HEADER.unpack(docket[cursor : cursor + S_HEADER.size])
    cursor += S_HEADER.size
    uid = docket[cursor : cursor + uid_len]

    # the docket only points at the file actually holding the raw data
    filename = _rawdata_filepath(revlog, uid)
    return revlog.opener.tryread(filename)
45 45
46 46
def setup_persistent_nodemap(tr, revlog):
    """Install whatever is needed transaction side to persist a nodemap on disk

    (only actually persist the nodemap if this is relevant for this revlog)
    """
    if revlog._inline:
        # inlined revlogs are too small for persisting a nodemap to pay off
        return
    if revlog.nodemap_file is None:
        # persistent nodemap is not enabled for this revlog
        return
    callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
    if not tr.hasfinalize(callback_id):
        # register only once per transaction; the actual write happens
        # at transaction-finalization time
        tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
60 60
61 61
def _persist_nodemap(tr, revlog):
    """Write nodemap data on disk for a given revlog.

    Serializes the nodemap into a fresh raw-data file, rewrites the docket
    to point at it, and schedules removal of superseded raw-data files once
    the transaction closes.
    """
    if getattr(revlog, 'filteredrevs', ()):
        raise error.ProgrammingError(
            "cannot persist nodemap of a filtered changelog"
        )
    if revlog.nodemap_file is None:
        # BUGFIX: corrected typo in the error message ("enableb")
        msg = "calling persist nodemap on a revlog without the feature enabled"
        raise error.ProgrammingError(msg)
    if util.safehasattr(revlog.index, "nodemap_data_all"):
        # the index implementation can produce the binary form itself
        data = revlog.index.nodemap_data_all()
    else:
        data = persistent_data(revlog.index)
    uid = _make_uid()
    datafile = _rawdata_filepath(revlog, uid)
    olds = _other_rawdata_filepath(revlog, uid)
    if olds:
        realvfs = getattr(revlog, '_realopener', revlog.opener)

        def cleanup(tr):
            # drop raw-data files left over from previous dockets
            for oldfile in olds:
                realvfs.tryunlink(oldfile)

        callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
        tr.addpostclose(callback_id, cleanup)
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    with revlog.opener(datafile, b'w') as fd:
        fd.write(data)
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
        fp.write(_serialize_docket(uid))
    # EXP-TODO: if the transaction abort, we should remove the new data and
    # reinstall the old one.
98 98
99 99
100 100 ### Nodemap docket file
101 101 #
102 102 # The nodemap data are stored on disk using 2 files:
103 103 #
104 104 # * a raw data files containing a persistent nodemap
105 105 # (see `Nodemap Trie` section)
106 106 #
# * a small "docket" file containing metadata
108 108 #
# While the nodemap data can be multiple tens of megabytes, the "docket" is
# small, so it is easy to update it automatically or to duplicate its content
# during a transaction.
112 112 #
# Multiple raw data can exist at the same time (the currently valid one and a
# new one being used by an in-progress transaction). To accommodate this, the
# filename hosting the raw data has a variable part. The exact filename is
# specified inside the "docket" file.
117 117 #
118 118 # The docket file contains information to find, qualify and validate the raw
119 119 # data. Its content is currently very light, but it will expand as the on disk
120 120 # nodemap gains the necessary features to be used in production.
121 121
# version 0 is experimental, no BC guarantee, do not use outside of tests.
ONDISK_VERSION = 0

# docket header structs: one unsigned byte for the file format version,
# one unsigned byte for the length of the uid that follows
S_VERSION = struct.Struct(">B")
S_HEADER = struct.Struct(">B")

# number of random bytes drawn for a new uid (hex-encoded afterwards)
ID_SIZE = 8
129 129
130 130
def _make_uid():
    """return a new unique identifier.

    The identifier is random and composed of ascii characters."""
    raw = os.urandom(ID_SIZE)
    return nodemod.hex(raw)
136 136
137 137
def _serialize_docket(uid):
    """return serialized bytes for a docket using the passed uid

    Layout: <version><uid length><uid>."""
    header = S_VERSION.pack(ONDISK_VERSION) + S_HEADER.pack(len(uid))
    return header + uid
145 145
146 146
147 147 def _rawdata_filepath(revlog, uid):
148 148 """The (vfs relative) nodemap's rawdata file for a given uid"""
149 149 prefix = revlog.nodemap_file[:-2]
150 150 return b"%s-%s.nd" % (prefix, uid)
151 151
152 152
def _other_rawdata_filepath(revlog, uid):
    """Return the existing nodemap raw-data files NOT matching `uid`.

    These are leftovers from previous dockets and are candidates for
    removal once the new docket is committed.
    """
    prefix = revlog.nodemap_file[:-2]
    # Use a raw bytes literal so `\.` is a proper regex escape (a plain
    # string literal would be an invalid string escape), and escape the
    # prefix so a "." (or other metacharacter) in the filename matches
    # literally instead of as a regex wildcard.
    pattern = re.compile(br"(^|/)%s-[0-9a-f]+\.nd$" % re.escape(prefix))
    new_file_path = _rawdata_filepath(revlog, uid)
    new_file_name = revlog.opener.basename(new_file_path)
    dirpath = revlog.opener.dirname(new_file_path)
    others = []
    for f in revlog.opener.listdir(dirpath):
        if pattern.match(f) and f != new_file_name:
            others.append(f)
    return others
164 164
165 165
166 166 ### Nodemap Trie
167 167 #
168 168 # This is a simple reference implementation to compute and persist a nodemap
169 169 # trie. This reference implementation is write only. The python version of this
# is not expected to be actually used, since it won't provide performance
# improvement over the existing non-persistent C implementation.
172 172 #
# The nodemap is persisted as a Trie using 4bits-address/16-entries blocks. Each
# revision can be addressed using its node's shortest prefix.
175 175 #
# The trie is stored as a sequence of blocks. Each block contains 16 entries
# (signed 32bit integer, big endian; see S_BLOCK). Each entry can be one of the following:
178 178 #
179 179 # * value >= 0 -> index of sub-block
180 180 # * value == -1 -> no value
# * value < -1 -> a revision value: rev = -(value+2) (see REV_OFFSET)
182 182 #
# The implementation focuses on simplicity, not on performance. A Rust
# implementation should provide an efficient version of the same binary
# persistence. This reference python implementation is never meant to be
# extensively used in production.
187 187
188 188
def persistent_data(index):
    """return the persistent binary form for a nodemap for a given index"""
    # build the in-memory trie, then flatten it into its on-disk bytes
    return _persist_trie(_build_trie(index))
194 194
195 195
# one trie block: 16 signed 32-bit big-endian integers ("l" is 4 bytes in
# standard struct size mode)
S_BLOCK = struct.Struct(">" + ("l" * 16))

# sentinel meaning "no entry in this slot"
NO_ENTRY = -1
# rev 0 needs to be -2 because 0 is used by block, -1 is a special value.
REV_OFFSET = 2
201 201
202 202
def _transform_rev(rev):
    """Return the number used to represent the rev in the tree.

    (or retrieve a rev number from such representation)

    Note that this is an involution, a function equal to its inverse (i.e.
    which gives the identity when applied to itself).
    """
    # shift by REV_OFFSET so rev 0 encodes to -2 (0 and -1 are reserved)
    shifted = rev + REV_OFFSET
    return -shifted
212 212
213 213
214 214 def _to_int(hex_digit):
215 215 """turn an hexadecimal digit into a proper integer"""
216 216 return int(hex_digit, 16)
217 217
218 218
class Block(dict):
    """A single block of the Trie, holding up to 16 entries (indices 0-15).

    Entries may be revision numbers, child Blocks, or absent."""

    def __init__(self):
        super(Block, self).__init__()
        # identifier of this block in the on-disk data when it was read
        # from disk; None for blocks that only exist in memory
        self.ondisk_id = None

    def __iter__(self):
        # walk the 16 slots in order, yielding None for empty ones
        for slot in range(16):
            yield self.get(slot)
226 231
227 232
def _build_trie(index):
    """build a nodemap trie

    The nodemap stores revision number for each unique prefix.

    Each block is a dictionary with keys in `[0, 15]`. Values are either
    another block or a revision number.
    """
    root = Block()
    for rev in range(len(index)):
        # renamed from `hex`, which shadowed the builtin of the same name;
        # index entries store the binary node at position 7
        hex_node = nodemod.hex(index[rev][7])
        _insert_into_block(index, 0, root, rev, hex_node)
    return root
241 246
242 247
def _insert_into_block(index, level, block, current_rev, current_hex):
    """insert a new revision in a block

    index: the index we are adding revision for
    level: the depth of the current block in the trie
    block: the block currently being considered
    current_rev: the revision number we are adding
    current_hex: the hexadecimal representation of that revision's node
    """
    digit = _to_int(current_hex[level : level + 1])
    existing = block.get(digit)
    if existing is None:
        # free slot: store the revision number directly
        block[digit] = current_rev
        return
    if isinstance(existing, dict):
        # an intermediate block already exists, descend into it
        _insert_into_block(index, level + 1, existing, current_rev, current_hex)
        return
    # the slot holds a single revision whose prefix collides with ours:
    # replace it with a fresh sub-block and re-insert both revisions one
    # level deeper
    colliding_rev = existing
    colliding_hex = nodemod.hex(index[colliding_rev][7])
    sub = Block()
    block[digit] = sub
    _insert_into_block(index, level + 1, sub, colliding_rev, colliding_hex)
    _insert_into_block(index, level + 1, sub, current_rev, current_hex)
269 274
270 275
def _persist_trie(root):
    """turn a nodemap trie into persistent binary data

    See `_build_trie` for nodemap trie structure. Blocks are written
    children first, so the root block always comes last."""
    offsets = {}
    pieces = []
    for current in _walk_trie(root):
        # record the block's index before serializing it, so that child
        # blocks (already visited) can be referenced by their position
        offsets[id(current)] = len(pieces)
        pieces.append(_persist_block(current, offsets))
    return b''.join(pieces)
281 286
282 287
283 288 def _walk_trie(block):
284 289 """yield all the block in a trie
285 290
286 291 Children blocks are always yield before their parent block.
287 292 """
288 293 for (_, item) in sorted(block.items()):
289 294 if isinstance(item, dict):
290 295 for sub_block in _walk_trie(item):
291 296 yield sub_block
292 297 yield block
293 298
294 299
def _persist_block(block_node, block_map):
    """produce persistent binary data for a single block

    Children blocks are assumed to be already persisted and present in
    block_map (id -> on-disk index).
    """
    entries = [_to_value(item, block_map) for item in block_node]
    return S_BLOCK.pack(*entries)
303 308
304 309
def _to_value(item, block_map):
    """persist any value as an integer"""
    if item is None:
        # empty slot marker
        return NO_ENTRY
    if isinstance(item, dict):
        # child block: reference it by its on-disk index
        return block_map[id(item)]
    # plain revision number, encoded so it stays strictly below -1
    return _transform_rev(item)
313 318
314 319
def parse_data(data):
    """parse nodemap data into a nodemap Trie"""
    # the data must be an exact sequence of fixed-size blocks
    if (len(data) % S_BLOCK.size) != 0:
        msg = "nodemap data size is not a multiple of block size (%d): %d"
        raise error.Abort(msg % (S_BLOCK.size, len(data)))
    if not data:
        # nothing on disk yet: an empty root block
        return Block()
    block_map = {}
    new_blocks = []
    # first pass: materialize every block and record its on-disk id so
    # that inter-block references can be resolved in the second pass
    for i in range(0, len(data), S_BLOCK.size):
        block = Block()
        block.ondisk_id = len(block_map)
        block_map[block.ondisk_id] = block
        block_data = data[i : i + S_BLOCK.size]
        values = S_BLOCK.unpack(block_data)
        new_blocks.append((block, values))
    # second pass: translate the raw integers into entries (child block
    # reference, revision number, or nothing)
    for b, values in new_blocks:
        for idx, v in enumerate(values):
            if v == NO_ENTRY:
                continue
            elif v >= 0:
                b[idx] = block_map[v]
            else:
                b[idx] = _transform_rev(v)
    # blocks are serialized children first (see _walk_trie), so the last
    # block parsed is the root of the trie
    return block
340 345
341 346
342 347 # debug utility
343 348
344 349
def check_data(ui, index, data):
    """verify that the provided nodemap data are valid for the given index

    Returns 0 when nodemap and index agree, 1 otherwise; problems are
    reported on the ui error channel."""
    exit_code = 0
    ui.status((b"revision in index: %d\n") % len(index))
    root = parse_data(data)
    remaining = set(_all_revisions(root))
    ui.status((b"revision in nodemap: %d\n") % len(remaining))
    for rev in range(len(index)):
        if rev not in remaining:
            msg = b" revision missing from nodemap: %d\n" % rev
            ui.write_err(msg)
            exit_code = 1
            continue
        remaining.remove(rev)
        # the node of each indexed revision must resolve back to it
        nm_rev = _find_node(root, nodemod.hex(index[rev][7]))
        if nm_rev is None:
            msg = b" revision node does not match any entries: %d\n" % rev
            ui.write_err(msg)
            exit_code = 1
        elif nm_rev != rev:
            msg = (
                b" revision node does not match the expected revision: "
                b"%d != %d\n" % (rev, nm_rev)
            )
            ui.write_err(msg)
            exit_code = 1

    # anything left over exists in the nodemap but not in the index
    if remaining:
        for rev in sorted(remaining):
            msg = b" extra revision in nodemap: %d\n" % rev
            ui.write_err(msg)
            exit_code = 1
    return exit_code
378 383
379 384
def _all_revisions(root):
    """return all revisions stored in a Trie"""
    for current in _walk_trie(root):
        for entry in current:
            # skip empty slots and child-block references
            if entry is None or isinstance(entry, Block):
                continue
            yield entry
387 392
388 393
def _find_node(block, node):
    """find the revision associated with a given node

    Walks the trie one hex digit at a time; returns None when the node
    has no entry."""
    current = block
    remaining = node
    while isinstance(current, dict):
        current = current.get(_to_int(remaining[0:1]))
        remaining = remaining[1:]
    return current
General Comments 0
You need to be logged in to leave comments. Login now