nodemap: code to parse the persistent binary nodemap data...
marmoute
r44798:78721bbd default
@@ -1,312 +1,339 @@
# nodemap.py - nodemap related code and utilities
#
# Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
# Copyright 2019 George Racinet <georges.racinet@octobus.net>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import os
import re
import struct

from .. import (
    error,
    node as nodemod,
    util,
)


class NodeMap(dict):
    def __missing__(self, x):
        raise error.RevlogError(b'unknown node: %s' % x)


def persisted_data(revlog):
    """read the nodemap for a revlog from disk"""
    if revlog.nodemap_file is None:
        return None
    pdata = revlog.opener.tryread(revlog.nodemap_file)
    if not pdata:
        return None
    offset = 0
    (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
    if version != ONDISK_VERSION:
        return None
    offset += S_VERSION.size
    (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
    offset += S_HEADER.size
    uid = pdata[offset : offset + uuid_size]

    filename = _rawdata_filepath(revlog, uid)
    return revlog.opener.tryread(filename)


def setup_persistent_nodemap(tr, revlog):
    """Install whatever is needed transaction side to persist a nodemap on disk

    (only actually persist the nodemap if this is relevant for this revlog)
    """
    if revlog._inline:
        return  # inlined revlogs are too small for this to be relevant
    if revlog.nodemap_file is None:
        return  # we do not use persistent_nodemap on this revlog
    callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
    if tr.hasfinalize(callback_id):
        return  # no need to register again
    tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))


def _persist_nodemap(tr, revlog):
    """Write nodemap data on disk for a given revlog
    """
    if getattr(revlog, 'filteredrevs', ()):
        raise error.ProgrammingError(
            "cannot persist nodemap of a filtered changelog"
        )
    if revlog.nodemap_file is None:
        msg = "calling persist nodemap on a revlog without the feature enabled"
        raise error.ProgrammingError(msg)
    if util.safehasattr(revlog.index, "nodemap_data_all"):
        data = revlog.index.nodemap_data_all()
    else:
        data = persistent_data(revlog.index)
    uid = _make_uid()
    datafile = _rawdata_filepath(revlog, uid)
    olds = _other_rawdata_filepath(revlog, uid)
    if olds:
        realvfs = getattr(revlog, '_realopener', revlog.opener)

        def cleanup(tr):
            for oldfile in olds:
                realvfs.tryunlink(oldfile)

        callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
        tr.addpostclose(callback_id, cleanup)
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    with revlog.opener(datafile, b'w') as fd:
        fd.write(data)
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
        fp.write(_serialize_docket(uid))
    # EXP-TODO: if the transaction aborts, we should remove the new data and
    # reinstall the old one.


### Nodemap docket file
#
# The nodemap data are stored on disk using 2 files:
#
# * a raw data file containing a persistent nodemap
#   (see `Nodemap Trie` section)
#
# * a small "docket" file containing metadata
#
# While the nodemap data can be multiple tens of megabytes, the "docket" is
# small, so it is easy to update it automatically or to duplicate its content
# during a transaction.
#
# Multiple raw data files can exist at the same time (the currently valid one
# and a new one being used by an in-progress transaction). To accommodate
# this, the filename hosting the raw data has a variable part. The exact
# filename is specified inside the "docket" file.
#
# The docket file contains information to find, qualify and validate the raw
# data. Its content is currently very light, but it will expand as the on-disk
# nodemap gains the necessary features to be used in production.

# version 0 is experimental, no BC guarantee, do not use outside of tests.
ONDISK_VERSION = 0

S_VERSION = struct.Struct(">B")
S_HEADER = struct.Struct(">B")

ID_SIZE = 8


def _make_uid():
    """return a new unique identifier.

    The identifier is random and composed of ascii characters."""
    return nodemod.hex(os.urandom(ID_SIZE))


def _serialize_docket(uid):
    """return serialized bytes for a docket using the passed uid"""
    data = []
    data.append(S_VERSION.pack(ONDISK_VERSION))
    data.append(S_HEADER.pack(len(uid)))
    data.append(uid)
    return b''.join(data)
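

# A minimal sketch of the docket layout described above (the _example_* helper
# is hypothetical, not part of the original file): a docket is just
# <version byte><uid length byte><uid>, so a freshly serialized docket can be
# decoded with the same structs that persisted_data() uses when reading it
# back from disk.
def _example_docket_roundtrip():
    uid = _make_uid()
    docket = _serialize_docket(uid)
    (version,) = S_VERSION.unpack(docket[: S_VERSION.size])
    offset = S_VERSION.size
    (uid_size,) = S_HEADER.unpack(docket[offset : offset + S_HEADER.size])
    offset += S_HEADER.size
    assert version == ONDISK_VERSION
    assert uid_size == len(uid)
    assert docket[offset : offset + uid_size] == uid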


def _rawdata_filepath(revlog, uid):
    """The (vfs relative) nodemap's rawdata file for a given uid"""
    prefix = revlog.nodemap_file[:-2]
    return b"%s-%s.nd" % (prefix, uid)


def _other_rawdata_filepath(revlog, uid):
    prefix = revlog.nodemap_file[:-2]
    pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
    new_file_path = _rawdata_filepath(revlog, uid)
    new_file_name = revlog.opener.basename(new_file_path)
    dirpath = revlog.opener.dirname(new_file_path)
    others = []
    for f in revlog.opener.listdir(dirpath):
        if pattern.match(f) and f != new_file_name:
            others.append(f)
    return others


### Nodemap Trie
#
# This is a simple reference implementation to compute and persist a nodemap
# trie. This reference implementation is write only. The python version of
# this is not expected to be actually used, since it won't provide any
# performance improvement over the existing non-persistent C implementation.
#
# The nodemap is persisted as a Trie using 4bits-address/16-entries blocks.
# Each revision can be addressed using its node's shortest prefix.
#
# The trie is stored as a sequence of blocks. Each block contains 16 entries
# (signed 32bit integer, big endian; see S_BLOCK below). Each entry can be one
# of the following:
#
# * value >= 0 -> index of sub-block
# * value == -1 -> no value
# * value < -1 -> a revision value: rev = -(value+2)
#
# The implementation focuses on simplicity, not on performance. A Rust
# implementation should provide an efficient version of the same binary
# persistence. This reference python implementation is never meant to be
# used extensively in production.


def persistent_data(index):
    """return the persistent binary form for a nodemap for a given index
    """
    trie = _build_trie(index)
    return _persist_trie(trie)


S_BLOCK = struct.Struct(">" + ("l" * 16))

NO_ENTRY = -1
# rev 0 needs to be -2 because 0 is used by blocks and -1 is a special value.
REV_OFFSET = 2


def _transform_rev(rev):
    """Return the number used to represent the rev in the tree.

    (or retrieve a rev number from such representation)

    Note that this is an involution, a function equal to its inverse (i.e.
    which gives the identity when applied to itself).
    """
    return -(rev + REV_OFFSET)
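

# A minimal illustration of the encoding above (the _example_* helper is
# hypothetical, not part of the original file): revisions are stored as values
# strictly smaller than NO_ENTRY, and applying _transform_rev twice gives back
# the original revision number.
def _example_transform_rev_involution():
    for rev in (0, 1, 5, 1000):
        encoded = _transform_rev(rev)  # 0 -> -2, 1 -> -3, 5 -> -7, ...
        assert encoded < NO_ENTRY  # never collides with -1 or block indexes
        assert _transform_rev(encoded) == rev  # involution: decode == encode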


def _to_int(hex_digit):
    """turn an hexadecimal digit into a proper integer"""
    return int(hex_digit, 16)


class Block(dict):
    """represent a block of the Trie

    contains up to 16 entries indexed from 0 to 15"""

    def __iter__(self):
        return iter(self.get(i) for i in range(16))


def _build_trie(index):
    """build a nodemap trie

    The nodemap stores revision numbers for each unique prefix.

    Each block is a dictionary with keys in `[0, 15]`. Values are either
    another block or a revision number.
    """
    root = Block()
    for rev in range(len(index)):
        hex = nodemod.hex(index[rev][7])
        _insert_into_block(index, 0, root, rev, hex)
    return root


def _insert_into_block(index, level, block, current_rev, current_hex):
    """insert a new revision in a block

    index: the index we are adding a revision for
    level: the depth of the current block in the trie
    block: the block currently being considered
    current_rev: the revision number we are adding
    current_hex: the hexadecimal representation of the node of that revision
    """
    hex_digit = _to_int(current_hex[level : level + 1])
    entry = block.get(hex_digit)
    if entry is None:
        # no entry, simply store the revision number
        block[hex_digit] = current_rev
    elif isinstance(entry, dict):
        # need to recurse to an underlying block
        _insert_into_block(index, level + 1, entry, current_rev, current_hex)
    else:
        # collision with a previously unique prefix, inserting new
        # vertices to fit both entries.
        other_hex = nodemod.hex(index[entry][7])
        other_rev = entry
        new = Block()
        block[hex_digit] = new
        _insert_into_block(index, level + 1, new, other_rev, other_hex)
        _insert_into_block(index, level + 1, new, current_rev, current_hex)
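

# A small sketch of the collision handling above (hypothetical two-entry
# index; the _example_* helper is not part of the original file): two nodes
# sharing the leading hex digit force a sub-block one level deeper, where
# their second digits disambiguate them.
def _example_prefix_collision():
    fake_index = [
        (0, 0, 0, 0, 0, 0, 0, b'\x11' * 20),  # rev 0, node 1111...
        (0, 0, 0, 0, 0, 0, 0, b'\x12' * 20),  # rev 1, node 1212...
    ]
    root = _build_trie(fake_index)
    sub = root[1]  # both nodes start with hex digit 1
    assert isinstance(sub, Block)
    assert sub[1] == 0 and sub[2] == 1  # second digits 1 and 2 point at revs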


def _persist_trie(root):
    """turn a nodemap trie into persistent binary data

    See `_build_trie` for nodemap trie structure"""
    block_map = {}
    chunks = []
    for tn in _walk_trie(root):
        block_map[id(tn)] = len(chunks)
        chunks.append(_persist_block(tn, block_map))
    return b''.join(chunks)


def _walk_trie(block):
    """yield all the blocks in a trie

    Children blocks are always yielded before their parent block.
    """
    for (_, item) in sorted(block.items()):
        if isinstance(item, dict):
            for sub_block in _walk_trie(item):
                yield sub_block
    yield block
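

# A tiny sketch of the traversal order (hypothetical blocks; the _example_*
# helper is not part of the original file): children are yielded before their
# parent, so _persist_block() below can assume every referenced sub-block
# already has an offset in block_map, and the root always ends up as the last
# block on disk.
def _example_walk_order():
    child = Block({0: 0})
    root = Block({5: child, 7: 1})
    assert list(_walk_trie(root)) == [child, root]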


def _persist_block(block_node, block_map):
    """produce persistent binary data for a single block

    Children blocks are assumed to be already persisted and present in
    block_map.
    """
    data = tuple(_to_value(v, block_map) for v in block_node)
    return S_BLOCK.pack(*data)


def _to_value(item, block_map):
    """persist any value as an integer"""
    if item is None:
        return NO_ENTRY
    elif isinstance(item, dict):
        return block_map[id(item)]
    else:
        return _transform_rev(item)


def parse_data(data):
    """parse nodemap data into a nodemap Trie"""
    if (len(data) % S_BLOCK.size) != 0:
        msg = "nodemap data size is not a multiple of block size (%d): %d"
        raise error.Abort(msg % (S_BLOCK.size, len(data)))
    if not data:
        return Block()
    block_map = {}
    new_blocks = []
    for i in range(0, len(data), S_BLOCK.size):
        block = Block()
        ondisk_id = len(block_map)
        block_map[ondisk_id] = block
        block_data = data[i : i + S_BLOCK.size]
        values = S_BLOCK.unpack(block_data)
        new_blocks.append((block, values))
    for b, values in new_blocks:
        for idx, v in enumerate(values):
            if v == NO_ENTRY:
                continue
            elif v >= 0:
                b[idx] = block_map[v]
            else:
                b[idx] = _transform_rev(v)
    # blocks are written children-first (see _walk_trie), so the last block
    # read back is the root of the trie
    return block
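

# A minimal end-to-end sketch (hypothetical three-entry index; the _example_*
# helper is not part of the original file): build a trie from a fake index,
# persist it with persistent_data(), and parse the bytes back with
# parse_data(). Each index entry only needs the node in position [7], which is
# all _build_trie() reads.
def _example_roundtrip():
    fake_nodes = [b'\x11' * 20, b'\x12' * 20, b'\xff' * 20]
    fake_index = [(0, 0, 0, 0, 0, 0, 0, node) for node in fake_nodes]
    data = persistent_data(fake_index)
    assert len(data) % S_BLOCK.size == 0  # a whole number of 64-byte blocks
    root = parse_data(data)
    # the unique first digit b'f' of the last node leads straight to rev 2
    assert root[_to_int(b'f')] == 2
    # revs 0 and 1 share the first digit b'1' and live in a sub-block
    assert root[_to_int(b'1')][_to_int(b'2')] == 1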