##// END OF EJS Templates
nodemap: code to parse the persistent binary nodemap data...
marmoute -
r44798:78721bbd default
parent child Browse files
Show More
@@ -1,312 +1,339 b''
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import os
12 12 import re
13 13 import struct
14 14
15 15 from .. import (
16 16 error,
17 17 node as nodemod,
18 18 util,
19 19 )
20 20
21 21
22 22 class NodeMap(dict):
23 23 def __missing__(self, x):
24 24 raise error.RevlogError(b'unknown node: %s' % x)
25 25
26 26
27 27 def persisted_data(revlog):
28 28 """read the nodemap for a revlog from disk"""
29 29 if revlog.nodemap_file is None:
30 30 return None
31 31 pdata = revlog.opener.tryread(revlog.nodemap_file)
32 32 if not pdata:
33 33 return None
34 34 offset = 0
35 35 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
36 36 if version != ONDISK_VERSION:
37 37 return None
38 38 offset += S_VERSION.size
39 39 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
40 40 offset += S_HEADER.size
41 41 uid = pdata[offset : offset + uuid_size]
42 42
43 43 filename = _rawdata_filepath(revlog, uid)
44 44 return revlog.opener.tryread(filename)
45 45
46 46
47 47 def setup_persistent_nodemap(tr, revlog):
48 48 """Install whatever is needed transaction side to persist a nodemap on disk
49 49
50 50 (only actually persist the nodemap if this is relevant for this revlog)
51 51 """
52 52 if revlog._inline:
53 53 return # inlined revlog are too small for this to be relevant
54 54 if revlog.nodemap_file is None:
55 55 return # we do not use persistent_nodemap on this revlog
56 56 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
57 57 if tr.hasfinalize(callback_id):
58 58 return # no need to register again
59 59 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
60 60
61 61
62 62 def _persist_nodemap(tr, revlog):
63 63 """Write nodemap data on disk for a given revlog
64 64 """
65 65 if getattr(revlog, 'filteredrevs', ()):
66 66 raise error.ProgrammingError(
67 67 "cannot persist nodemap of a filtered changelog"
68 68 )
69 69 if revlog.nodemap_file is None:
70 70 msg = "calling persist nodemap on a revlog without the feature enableb"
71 71 raise error.ProgrammingError(msg)
72 72 if util.safehasattr(revlog.index, "nodemap_data_all"):
73 73 data = revlog.index.nodemap_data_all()
74 74 else:
75 75 data = persistent_data(revlog.index)
76 76 uid = _make_uid()
77 77 datafile = _rawdata_filepath(revlog, uid)
78 78 olds = _other_rawdata_filepath(revlog, uid)
79 79 if olds:
80 80 realvfs = getattr(revlog, '_realopener', revlog.opener)
81 81
82 82 def cleanup(tr):
83 83 for oldfile in olds:
84 84 realvfs.tryunlink(oldfile)
85 85
86 86 callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
87 87 tr.addpostclose(callback_id, cleanup)
88 88 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
89 89 # store vfs
90 90 with revlog.opener(datafile, b'w') as fd:
91 91 fd.write(data)
92 92 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
93 93 # store vfs
94 94 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
95 95 fp.write(_serialize_docket(uid))
96 96 # EXP-TODO: if the transaction abort, we should remove the new data and
97 97 # reinstall the old one.
98 98
99 99
100 100 ### Nodemap docket file
101 101 #
102 102 # The nodemap data are stored on disk using 2 files:
103 103 #
104 104 # * a raw data files containing a persistent nodemap
105 105 # (see `Nodemap Trie` section)
106 106 #
107 107 # * a small "docket" file containing medatadata
108 108 #
109 109 # While the nodemap data can be multiple tens of megabytes, the "docket" is
110 110 # small, it is easy to update it automatically or to duplicated its content
111 111 # during a transaction.
112 112 #
113 113 # Multiple raw data can exist at the same time (The currently valid one and a
114 114 # new one beind used by an in progress transaction). To accomodate this, the
115 115 # filename hosting the raw data has a variable parts. The exact filename is
116 116 # specified inside the "docket" file.
117 117 #
118 118 # The docket file contains information to find, qualify and validate the raw
119 119 # data. Its content is currently very light, but it will expand as the on disk
120 120 # nodemap gains the necessary features to be used in production.
121 121
122 122 # version 0 is experimental, no BC garantee, do no use outside of tests.
123 123 ONDISK_VERSION = 0
124 124
125 125 S_VERSION = struct.Struct(">B")
126 126 S_HEADER = struct.Struct(">B")
127 127
128 128 ID_SIZE = 8
129 129
130 130
131 131 def _make_uid():
132 132 """return a new unique identifier.
133 133
134 134 The identifier is random and composed of ascii characters."""
135 135 return nodemod.hex(os.urandom(ID_SIZE))
136 136
137 137
138 138 def _serialize_docket(uid):
139 139 """return serialized bytes for a docket using the passed uid"""
140 140 data = []
141 141 data.append(S_VERSION.pack(ONDISK_VERSION))
142 142 data.append(S_HEADER.pack(len(uid)))
143 143 data.append(uid)
144 144 return b''.join(data)
145 145
146 146
147 147 def _rawdata_filepath(revlog, uid):
148 148 """The (vfs relative) nodemap's rawdata file for a given uid"""
149 149 prefix = revlog.nodemap_file[:-2]
150 150 return b"%s-%s.nd" % (prefix, uid)
151 151
152 152
153 153 def _other_rawdata_filepath(revlog, uid):
154 154 prefix = revlog.nodemap_file[:-2]
155 155 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
156 156 new_file_path = _rawdata_filepath(revlog, uid)
157 157 new_file_name = revlog.opener.basename(new_file_path)
158 158 dirpath = revlog.opener.dirname(new_file_path)
159 159 others = []
160 160 for f in revlog.opener.listdir(dirpath):
161 161 if pattern.match(f) and f != new_file_name:
162 162 others.append(f)
163 163 return others
164 164
165 165
166 166 ### Nodemap Trie
167 167 #
168 168 # This is a simple reference implementation to compute and persist a nodemap
169 169 # trie. This reference implementation is write only. The python version of this
170 170 # is not expected to be actually used, since it wont provide performance
171 171 # improvement over existing non-persistent C implementation.
172 172 #
173 173 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
174 174 # revision can be adressed using its node shortest prefix.
175 175 #
176 176 # The trie is stored as a sequence of block. Each block contains 16 entries
177 177 # (signed 64bit integer, big endian). Each entry can be one of the following:
178 178 #
179 179 # * value >= 0 -> index of sub-block
180 180 # * value == -1 -> no value
181 181 # * value < -1 -> a revision value: rev = -(value+10)
182 182 #
183 183 # The implementation focus on simplicity, not on performance. A Rust
184 184 # implementation should provide a efficient version of the same binary
185 185 # persistence. This reference python implementation is never meant to be
186 186 # extensively use in production.
187 187
188 188
189 189 def persistent_data(index):
190 190 """return the persistent binary form for a nodemap for a given index
191 191 """
192 192 trie = _build_trie(index)
193 193 return _persist_trie(trie)
194 194
195 195
196 196 S_BLOCK = struct.Struct(">" + ("l" * 16))
197 197
198 198 NO_ENTRY = -1
199 199 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
200 200 REV_OFFSET = 2
201 201
202 202
203 203 def _transform_rev(rev):
204 204 """Return the number used to represent the rev in the tree.
205 205
206 206 (or retrieve a rev number from such representation)
207 207
208 208 Note that this is an involution, a function equal to its inverse (i.e.
209 209 which gives the identity when applied to itself).
210 210 """
211 211 return -(rev + REV_OFFSET)
212 212
213 213
214 214 def _to_int(hex_digit):
215 215 """turn an hexadecimal digit into a proper integer"""
216 216 return int(hex_digit, 16)
217 217
218 218
219 219 class Block(dict):
220 220 """represent a block of the Trie
221 221
222 222 contains up to 16 entry indexed from 0 to 15"""
223 223
224 224 def __iter__(self):
225 225 return iter(self.get(i) for i in range(16))
226 226
227 227
228 228 def _build_trie(index):
229 229 """build a nodemap trie
230 230
231 231 The nodemap stores revision number for each unique prefix.
232 232
233 233 Each block is a dictionary with keys in `[0, 15]`. Values are either
234 234 another block or a revision number.
235 235 """
236 236 root = Block()
237 237 for rev in range(len(index)):
238 238 hex = nodemod.hex(index[rev][7])
239 239 _insert_into_block(index, 0, root, rev, hex)
240 240 return root
241 241
242 242
243 243 def _insert_into_block(index, level, block, current_rev, current_hex):
244 244 """insert a new revision in a block
245 245
246 246 index: the index we are adding revision for
247 247 level: the depth of the current block in the trie
248 248 block: the block currently being considered
249 249 current_rev: the revision number we are adding
250 250 current_hex: the hexadecimal representation of the of that revision
251 251 """
252 252 hex_digit = _to_int(current_hex[level : level + 1])
253 253 entry = block.get(hex_digit)
254 254 if entry is None:
255 255 # no entry, simply store the revision number
256 256 block[hex_digit] = current_rev
257 257 elif isinstance(entry, dict):
258 258 # need to recurse to an underlying block
259 259 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
260 260 else:
261 261 # collision with a previously unique prefix, inserting new
262 262 # vertices to fit both entry.
263 263 other_hex = nodemod.hex(index[entry][7])
264 264 other_rev = entry
265 265 new = Block()
266 266 block[hex_digit] = new
267 267 _insert_into_block(index, level + 1, new, other_rev, other_hex)
268 268 _insert_into_block(index, level + 1, new, current_rev, current_hex)
269 269
270 270
271 271 def _persist_trie(root):
272 272 """turn a nodemap trie into persistent binary data
273 273
274 274 See `_build_trie` for nodemap trie structure"""
275 275 block_map = {}
276 276 chunks = []
277 277 for tn in _walk_trie(root):
278 278 block_map[id(tn)] = len(chunks)
279 279 chunks.append(_persist_block(tn, block_map))
280 280 return b''.join(chunks)
281 281
282 282
283 283 def _walk_trie(block):
284 284 """yield all the block in a trie
285 285
286 286 Children blocks are always yield before their parent block.
287 287 """
288 288 for (_, item) in sorted(block.items()):
289 289 if isinstance(item, dict):
290 290 for sub_block in _walk_trie(item):
291 291 yield sub_block
292 292 yield block
293 293
294 294
295 295 def _persist_block(block_node, block_map):
296 296 """produce persistent binary data for a single block
297 297
298 298 Children block are assumed to be already persisted and present in
299 299 block_map.
300 300 """
301 301 data = tuple(_to_value(v, block_map) for v in block_node)
302 302 return S_BLOCK.pack(*data)
303 303
304 304
305 305 def _to_value(item, block_map):
306 306 """persist any value as an integer"""
307 307 if item is None:
308 308 return NO_ENTRY
309 309 elif isinstance(item, dict):
310 310 return block_map[id(item)]
311 311 else:
312 312 return _transform_rev(item)
313
314
315 def parse_data(data):
316 """parse parse nodemap data into a nodemap Trie"""
317 if (len(data) % S_BLOCK.size) != 0:
318 msg = "nodemap data size is not a multiple of block size (%d): %d"
319 raise error.Abort(msg % (S_BLOCK.size, len(data)))
320 if not data:
321 return Block()
322 block_map = {}
323 new_blocks = []
324 for i in range(0, len(data), S_BLOCK.size):
325 block = Block()
326 ondisk_id = len(block_map)
327 block_map[ondisk_id] = block
328 block_data = data[i : i + S_BLOCK.size]
329 values = S_BLOCK.unpack(block_data)
330 new_blocks.append((block, values))
331 for b, values in new_blocks:
332 for idx, v in enumerate(values):
333 if v == NO_ENTRY:
334 continue
335 elif v >= 0:
336 b[idx] = block_map[v]
337 else:
338 b[idx] = _transform_rev(v)
339 return block
General Comments 0
You need to be logged in to leave comments. Login now