##// END OF EJS Templates
nodemap: use an explicit "Block" object in the reference implementation...
marmoute -
r44796:7762a295 default
parent child Browse files
Show More
@@ -1,303 +1,309
1 # nodemap.py - nodemap related code and utilities
1 # nodemap.py - nodemap related code and utilities
2 #
2 #
3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 #
5 #
6 # This software may be used and distributed according to the terms of the
6 # This software may be used and distributed according to the terms of the
7 # GNU General Public License version 2 or any later version.
7 # GNU General Public License version 2 or any later version.
8
8
9 from __future__ import absolute_import
9 from __future__ import absolute_import
10
10
11 import os
11 import os
12 import re
12 import re
13 import struct
13 import struct
14
14
15 from .. import (
15 from .. import (
16 error,
16 error,
17 node as nodemod,
17 node as nodemod,
18 util,
18 util,
19 )
19 )
20
20
21
21
22 class NodeMap(dict):
22 class NodeMap(dict):
23 def __missing__(self, x):
23 def __missing__(self, x):
24 raise error.RevlogError(b'unknown node: %s' % x)
24 raise error.RevlogError(b'unknown node: %s' % x)
25
25
26
26
27 def persisted_data(revlog):
27 def persisted_data(revlog):
28 """read the nodemap for a revlog from disk"""
28 """read the nodemap for a revlog from disk"""
29 if revlog.nodemap_file is None:
29 if revlog.nodemap_file is None:
30 return None
30 return None
31 pdata = revlog.opener.tryread(revlog.nodemap_file)
31 pdata = revlog.opener.tryread(revlog.nodemap_file)
32 if not pdata:
32 if not pdata:
33 return None
33 return None
34 offset = 0
34 offset = 0
35 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
35 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
36 if version != ONDISK_VERSION:
36 if version != ONDISK_VERSION:
37 return None
37 return None
38 offset += S_VERSION.size
38 offset += S_VERSION.size
39 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
39 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
40 offset += S_HEADER.size
40 offset += S_HEADER.size
41 uid = pdata[offset : offset + uuid_size]
41 uid = pdata[offset : offset + uuid_size]
42
42
43 filename = _rawdata_filepath(revlog, uid)
43 filename = _rawdata_filepath(revlog, uid)
44 return revlog.opener.tryread(filename)
44 return revlog.opener.tryread(filename)
45
45
46
46
47 def setup_persistent_nodemap(tr, revlog):
47 def setup_persistent_nodemap(tr, revlog):
48 """Install whatever is needed transaction side to persist a nodemap on disk
48 """Install whatever is needed transaction side to persist a nodemap on disk
49
49
50 (only actually persist the nodemap if this is relevant for this revlog)
50 (only actually persist the nodemap if this is relevant for this revlog)
51 """
51 """
52 if revlog._inline:
52 if revlog._inline:
53 return # inlined revlog are too small for this to be relevant
53 return # inlined revlog are too small for this to be relevant
54 if revlog.nodemap_file is None:
54 if revlog.nodemap_file is None:
55 return # we do not use persistent_nodemap on this revlog
55 return # we do not use persistent_nodemap on this revlog
56 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
56 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
57 if tr.hasfinalize(callback_id):
57 if tr.hasfinalize(callback_id):
58 return # no need to register again
58 return # no need to register again
59 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
59 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
60
60
61
61
62 def _persist_nodemap(tr, revlog):
62 def _persist_nodemap(tr, revlog):
63 """Write nodemap data on disk for a given revlog
63 """Write nodemap data on disk for a given revlog
64 """
64 """
65 if getattr(revlog, 'filteredrevs', ()):
65 if getattr(revlog, 'filteredrevs', ()):
66 raise error.ProgrammingError(
66 raise error.ProgrammingError(
67 "cannot persist nodemap of a filtered changelog"
67 "cannot persist nodemap of a filtered changelog"
68 )
68 )
69 if revlog.nodemap_file is None:
69 if revlog.nodemap_file is None:
70 msg = "calling persist nodemap on a revlog without the feature enableb"
70 msg = "calling persist nodemap on a revlog without the feature enableb"
71 raise error.ProgrammingError(msg)
71 raise error.ProgrammingError(msg)
72 if util.safehasattr(revlog.index, "nodemap_data_all"):
72 if util.safehasattr(revlog.index, "nodemap_data_all"):
73 data = revlog.index.nodemap_data_all()
73 data = revlog.index.nodemap_data_all()
74 else:
74 else:
75 data = persistent_data(revlog.index)
75 data = persistent_data(revlog.index)
76 uid = _make_uid()
76 uid = _make_uid()
77 datafile = _rawdata_filepath(revlog, uid)
77 datafile = _rawdata_filepath(revlog, uid)
78 olds = _other_rawdata_filepath(revlog, uid)
78 olds = _other_rawdata_filepath(revlog, uid)
79 if olds:
79 if olds:
80 realvfs = getattr(revlog, '_realopener', revlog.opener)
80 realvfs = getattr(revlog, '_realopener', revlog.opener)
81
81
82 def cleanup(tr):
82 def cleanup(tr):
83 for oldfile in olds:
83 for oldfile in olds:
84 realvfs.tryunlink(oldfile)
84 realvfs.tryunlink(oldfile)
85
85
86 callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
86 callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
87 tr.addpostclose(callback_id, cleanup)
87 tr.addpostclose(callback_id, cleanup)
88 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
88 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
89 # store vfs
89 # store vfs
90 with revlog.opener(datafile, b'w') as fd:
90 with revlog.opener(datafile, b'w') as fd:
91 fd.write(data)
91 fd.write(data)
92 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
92 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
93 # store vfs
93 # store vfs
94 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
94 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
95 fp.write(_serialize_docket(uid))
95 fp.write(_serialize_docket(uid))
96 # EXP-TODO: if the transaction abort, we should remove the new data and
96 # EXP-TODO: if the transaction abort, we should remove the new data and
97 # reinstall the old one.
97 # reinstall the old one.
98
98
99
99
100 ### Nodemap docket file
100 ### Nodemap docket file
101 #
101 #
102 # The nodemap data are stored on disk using 2 files:
102 # The nodemap data are stored on disk using 2 files:
103 #
103 #
104 # * a raw data files containing a persistent nodemap
104 # * a raw data files containing a persistent nodemap
105 # (see `Nodemap Trie` section)
105 # (see `Nodemap Trie` section)
106 #
106 #
107 # * a small "docket" file containing medatadata
107 # * a small "docket" file containing medatadata
108 #
108 #
109 # While the nodemap data can be multiple tens of megabytes, the "docket" is
109 # While the nodemap data can be multiple tens of megabytes, the "docket" is
110 # small, it is easy to update it automatically or to duplicated its content
110 # small, it is easy to update it automatically or to duplicated its content
111 # during a transaction.
111 # during a transaction.
112 #
112 #
113 # Multiple raw data can exist at the same time (The currently valid one and a
113 # Multiple raw data can exist at the same time (The currently valid one and a
114 # new one beind used by an in progress transaction). To accomodate this, the
114 # new one beind used by an in progress transaction). To accomodate this, the
115 # filename hosting the raw data has a variable parts. The exact filename is
115 # filename hosting the raw data has a variable parts. The exact filename is
116 # specified inside the "docket" file.
116 # specified inside the "docket" file.
117 #
117 #
118 # The docket file contains information to find, qualify and validate the raw
118 # The docket file contains information to find, qualify and validate the raw
119 # data. Its content is currently very light, but it will expand as the on disk
119 # data. Its content is currently very light, but it will expand as the on disk
120 # nodemap gains the necessary features to be used in production.
120 # nodemap gains the necessary features to be used in production.
121
121
122 # version 0 is experimental, no BC garantee, do no use outside of tests.
122 # version 0 is experimental, no BC garantee, do no use outside of tests.
123 ONDISK_VERSION = 0
123 ONDISK_VERSION = 0
124
124
125 S_VERSION = struct.Struct(">B")
125 S_VERSION = struct.Struct(">B")
126 S_HEADER = struct.Struct(">B")
126 S_HEADER = struct.Struct(">B")
127
127
128 ID_SIZE = 8
128 ID_SIZE = 8
129
129
130
130
131 def _make_uid():
131 def _make_uid():
132 """return a new unique identifier.
132 """return a new unique identifier.
133
133
134 The identifier is random and composed of ascii characters."""
134 The identifier is random and composed of ascii characters."""
135 return nodemod.hex(os.urandom(ID_SIZE))
135 return nodemod.hex(os.urandom(ID_SIZE))
136
136
137
137
138 def _serialize_docket(uid):
138 def _serialize_docket(uid):
139 """return serialized bytes for a docket using the passed uid"""
139 """return serialized bytes for a docket using the passed uid"""
140 data = []
140 data = []
141 data.append(S_VERSION.pack(ONDISK_VERSION))
141 data.append(S_VERSION.pack(ONDISK_VERSION))
142 data.append(S_HEADER.pack(len(uid)))
142 data.append(S_HEADER.pack(len(uid)))
143 data.append(uid)
143 data.append(uid)
144 return b''.join(data)
144 return b''.join(data)
145
145
146
146
147 def _rawdata_filepath(revlog, uid):
147 def _rawdata_filepath(revlog, uid):
148 """The (vfs relative) nodemap's rawdata file for a given uid"""
148 """The (vfs relative) nodemap's rawdata file for a given uid"""
149 prefix = revlog.nodemap_file[:-2]
149 prefix = revlog.nodemap_file[:-2]
150 return b"%s-%s.nd" % (prefix, uid)
150 return b"%s-%s.nd" % (prefix, uid)
151
151
152
152
153 def _other_rawdata_filepath(revlog, uid):
153 def _other_rawdata_filepath(revlog, uid):
154 prefix = revlog.nodemap_file[:-2]
154 prefix = revlog.nodemap_file[:-2]
155 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
155 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
156 new_file_path = _rawdata_filepath(revlog, uid)
156 new_file_path = _rawdata_filepath(revlog, uid)
157 new_file_name = revlog.opener.basename(new_file_path)
157 new_file_name = revlog.opener.basename(new_file_path)
158 dirpath = revlog.opener.dirname(new_file_path)
158 dirpath = revlog.opener.dirname(new_file_path)
159 others = []
159 others = []
160 for f in revlog.opener.listdir(dirpath):
160 for f in revlog.opener.listdir(dirpath):
161 if pattern.match(f) and f != new_file_name:
161 if pattern.match(f) and f != new_file_name:
162 others.append(f)
162 others.append(f)
163 return others
163 return others
164
164
165
165
166 ### Nodemap Trie
166 ### Nodemap Trie
167 #
167 #
168 # This is a simple reference implementation to compute and persist a nodemap
168 # This is a simple reference implementation to compute and persist a nodemap
169 # trie. This reference implementation is write only. The python version of this
169 # trie. This reference implementation is write only. The python version of this
170 # is not expected to be actually used, since it wont provide performance
170 # is not expected to be actually used, since it wont provide performance
171 # improvement over existing non-persistent C implementation.
171 # improvement over existing non-persistent C implementation.
172 #
172 #
173 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
173 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
174 # revision can be adressed using its node shortest prefix.
174 # revision can be adressed using its node shortest prefix.
175 #
175 #
176 # The trie is stored as a sequence of block. Each block contains 16 entries
176 # The trie is stored as a sequence of block. Each block contains 16 entries
177 # (signed 64bit integer, big endian). Each entry can be one of the following:
177 # (signed 64bit integer, big endian). Each entry can be one of the following:
178 #
178 #
179 # * value >= 0 -> index of sub-block
179 # * value >= 0 -> index of sub-block
180 # * value == -1 -> no value
180 # * value == -1 -> no value
181 # * value < -1 -> a revision value: rev = -(value+10)
181 # * value < -1 -> a revision value: rev = -(value+10)
182 #
182 #
183 # The implementation focus on simplicity, not on performance. A Rust
183 # The implementation focus on simplicity, not on performance. A Rust
184 # implementation should provide a efficient version of the same binary
184 # implementation should provide a efficient version of the same binary
185 # persistence. This reference python implementation is never meant to be
185 # persistence. This reference python implementation is never meant to be
186 # extensively use in production.
186 # extensively use in production.
187
187
188
188
189 def persistent_data(index):
189 def persistent_data(index):
190 """return the persistent binary form for a nodemap for a given index
190 """return the persistent binary form for a nodemap for a given index
191 """
191 """
192 trie = _build_trie(index)
192 trie = _build_trie(index)
193 return _persist_trie(trie)
193 return _persist_trie(trie)
194
194
195
195
196 S_BLOCK = struct.Struct(">" + ("l" * 16))
196 S_BLOCK = struct.Struct(">" + ("l" * 16))
197
197
198 NO_ENTRY = -1
198 NO_ENTRY = -1
199 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
199 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
200 REV_OFFSET = 2
200 REV_OFFSET = 2
201
201
202
202
203 def _transform_rev(rev):
203 def _transform_rev(rev):
204 """Return the number used to represent the rev in the tree.
204 """Return the number used to represent the rev in the tree.
205
205
206 (or retrieve a rev number from such representation)
206 (or retrieve a rev number from such representation)
207
207
208 Note that this is an involution, a function equal to its inverse (i.e.
208 Note that this is an involution, a function equal to its inverse (i.e.
209 which gives the identity when applied to itself).
209 which gives the identity when applied to itself).
210 """
210 """
211 return -(rev + REV_OFFSET)
211 return -(rev + REV_OFFSET)
212
212
213
213
214 def _to_int(hex_digit):
214 def _to_int(hex_digit):
215 """turn an hexadecimal digit into a proper integer"""
215 """turn an hexadecimal digit into a proper integer"""
216 return int(hex_digit, 16)
216 return int(hex_digit, 16)
217
217
218
218
219 class Block(dict):
220 """represent a block of the Trie
221
222 contains up to 16 entry indexed from 0 to 15"""
223
224
219 def _build_trie(index):
225 def _build_trie(index):
220 """build a nodemap trie
226 """build a nodemap trie
221
227
222 The nodemap stores revision number for each unique prefix.
228 The nodemap stores revision number for each unique prefix.
223
229
224 Each block is a dictionary with keys in `[0, 15]`. Values are either
230 Each block is a dictionary with keys in `[0, 15]`. Values are either
225 another block or a revision number.
231 another block or a revision number.
226 """
232 """
227 root = {}
233 root = Block()
228 for rev in range(len(index)):
234 for rev in range(len(index)):
229 hex = nodemod.hex(index[rev][7])
235 hex = nodemod.hex(index[rev][7])
230 _insert_into_block(index, 0, root, rev, hex)
236 _insert_into_block(index, 0, root, rev, hex)
231 return root
237 return root
232
238
233
239
234 def _insert_into_block(index, level, block, current_rev, current_hex):
240 def _insert_into_block(index, level, block, current_rev, current_hex):
235 """insert a new revision in a block
241 """insert a new revision in a block
236
242
237 index: the index we are adding revision for
243 index: the index we are adding revision for
238 level: the depth of the current block in the trie
244 level: the depth of the current block in the trie
239 block: the block currently being considered
245 block: the block currently being considered
240 current_rev: the revision number we are adding
246 current_rev: the revision number we are adding
241 current_hex: the hexadecimal representation of the of that revision
247 current_hex: the hexadecimal representation of the of that revision
242 """
248 """
243 hex_digit = _to_int(current_hex[level : level + 1])
249 hex_digit = _to_int(current_hex[level : level + 1])
244 entry = block.get(hex_digit)
250 entry = block.get(hex_digit)
245 if entry is None:
251 if entry is None:
246 # no entry, simply store the revision number
252 # no entry, simply store the revision number
247 block[hex_digit] = current_rev
253 block[hex_digit] = current_rev
248 elif isinstance(entry, dict):
254 elif isinstance(entry, dict):
249 # need to recurse to an underlying block
255 # need to recurse to an underlying block
250 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
256 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
251 else:
257 else:
252 # collision with a previously unique prefix, inserting new
258 # collision with a previously unique prefix, inserting new
253 # vertices to fit both entry.
259 # vertices to fit both entry.
254 other_hex = nodemod.hex(index[entry][7])
260 other_hex = nodemod.hex(index[entry][7])
255 other_rev = entry
261 other_rev = entry
256 new = {}
262 new = Block()
257 block[hex_digit] = new
263 block[hex_digit] = new
258 _insert_into_block(index, level + 1, new, other_rev, other_hex)
264 _insert_into_block(index, level + 1, new, other_rev, other_hex)
259 _insert_into_block(index, level + 1, new, current_rev, current_hex)
265 _insert_into_block(index, level + 1, new, current_rev, current_hex)
260
266
261
267
262 def _persist_trie(root):
268 def _persist_trie(root):
263 """turn a nodemap trie into persistent binary data
269 """turn a nodemap trie into persistent binary data
264
270
265 See `_build_trie` for nodemap trie structure"""
271 See `_build_trie` for nodemap trie structure"""
266 block_map = {}
272 block_map = {}
267 chunks = []
273 chunks = []
268 for tn in _walk_trie(root):
274 for tn in _walk_trie(root):
269 block_map[id(tn)] = len(chunks)
275 block_map[id(tn)] = len(chunks)
270 chunks.append(_persist_block(tn, block_map))
276 chunks.append(_persist_block(tn, block_map))
271 return b''.join(chunks)
277 return b''.join(chunks)
272
278
273
279
274 def _walk_trie(block):
280 def _walk_trie(block):
275 """yield all the block in a trie
281 """yield all the block in a trie
276
282
277 Children blocks are always yield before their parent block.
283 Children blocks are always yield before their parent block.
278 """
284 """
279 for (_, item) in sorted(block.items()):
285 for (_, item) in sorted(block.items()):
280 if isinstance(item, dict):
286 if isinstance(item, dict):
281 for sub_block in _walk_trie(item):
287 for sub_block in _walk_trie(item):
282 yield sub_block
288 yield sub_block
283 yield block
289 yield block
284
290
285
291
286 def _persist_block(block_node, block_map):
292 def _persist_block(block_node, block_map):
287 """produce persistent binary data for a single block
293 """produce persistent binary data for a single block
288
294
289 Children block are assumed to be already persisted and present in
295 Children block are assumed to be already persisted and present in
290 block_map.
296 block_map.
291 """
297 """
292 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
298 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
293 return S_BLOCK.pack(*data)
299 return S_BLOCK.pack(*data)
294
300
295
301
296 def _to_value(item, block_map):
302 def _to_value(item, block_map):
297 """persist any value as an integer"""
303 """persist any value as an integer"""
298 if item is None:
304 if item is None:
299 return NO_ENTRY
305 return NO_ENTRY
300 elif isinstance(item, dict):
306 elif isinstance(item, dict):
301 return block_map[id(item)]
307 return block_map[id(item)]
302 else:
308 else:
303 return _transform_rev(item)
309 return _transform_rev(item)
General Comments 0
You need to be logged in to leave comments. Login now