##// END OF EJS Templates
nodemap: use an explicit "Block" object in the reference implementation...
marmoute -
r44796:7762a295 default
parent child Browse files
Show More
@@ -1,303 +1,309
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import os
12 12 import re
13 13 import struct
14 14
15 15 from .. import (
16 16 error,
17 17 node as nodemod,
18 18 util,
19 19 )
20 20
21 21
22 22 class NodeMap(dict):
23 23 def __missing__(self, x):
24 24 raise error.RevlogError(b'unknown node: %s' % x)
25 25
26 26
27 27 def persisted_data(revlog):
28 28 """read the nodemap for a revlog from disk"""
29 29 if revlog.nodemap_file is None:
30 30 return None
31 31 pdata = revlog.opener.tryread(revlog.nodemap_file)
32 32 if not pdata:
33 33 return None
34 34 offset = 0
35 35 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
36 36 if version != ONDISK_VERSION:
37 37 return None
38 38 offset += S_VERSION.size
39 39 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
40 40 offset += S_HEADER.size
41 41 uid = pdata[offset : offset + uuid_size]
42 42
43 43 filename = _rawdata_filepath(revlog, uid)
44 44 return revlog.opener.tryread(filename)
45 45
46 46
47 47 def setup_persistent_nodemap(tr, revlog):
48 48 """Install whatever is needed transaction side to persist a nodemap on disk
49 49
50 50 (only actually persist the nodemap if this is relevant for this revlog)
51 51 """
52 52 if revlog._inline:
53 53 return # inlined revlog are too small for this to be relevant
54 54 if revlog.nodemap_file is None:
55 55 return # we do not use persistent_nodemap on this revlog
56 56 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
57 57 if tr.hasfinalize(callback_id):
58 58 return # no need to register again
59 59 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
60 60
61 61
62 62 def _persist_nodemap(tr, revlog):
63 63 """Write nodemap data on disk for a given revlog
64 64 """
65 65 if getattr(revlog, 'filteredrevs', ()):
66 66 raise error.ProgrammingError(
67 67 "cannot persist nodemap of a filtered changelog"
68 68 )
69 69 if revlog.nodemap_file is None:
70 70 msg = "calling persist nodemap on a revlog without the feature enableb"
71 71 raise error.ProgrammingError(msg)
72 72 if util.safehasattr(revlog.index, "nodemap_data_all"):
73 73 data = revlog.index.nodemap_data_all()
74 74 else:
75 75 data = persistent_data(revlog.index)
76 76 uid = _make_uid()
77 77 datafile = _rawdata_filepath(revlog, uid)
78 78 olds = _other_rawdata_filepath(revlog, uid)
79 79 if olds:
80 80 realvfs = getattr(revlog, '_realopener', revlog.opener)
81 81
82 82 def cleanup(tr):
83 83 for oldfile in olds:
84 84 realvfs.tryunlink(oldfile)
85 85
86 86 callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
87 87 tr.addpostclose(callback_id, cleanup)
88 88 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
89 89 # store vfs
90 90 with revlog.opener(datafile, b'w') as fd:
91 91 fd.write(data)
92 92 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
93 93 # store vfs
94 94 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
95 95 fp.write(_serialize_docket(uid))
96 96 # EXP-TODO: if the transaction abort, we should remove the new data and
97 97 # reinstall the old one.
98 98
99 99
100 100 ### Nodemap docket file
101 101 #
102 102 # The nodemap data are stored on disk using 2 files:
103 103 #
104 104 # * a raw data files containing a persistent nodemap
105 105 # (see `Nodemap Trie` section)
106 106 #
107 107 # * a small "docket" file containing medatadata
108 108 #
109 109 # While the nodemap data can be multiple tens of megabytes, the "docket" is
110 110 # small, it is easy to update it automatically or to duplicated its content
111 111 # during a transaction.
112 112 #
113 113 # Multiple raw data can exist at the same time (The currently valid one and a
114 114 # new one beind used by an in progress transaction). To accomodate this, the
115 115 # filename hosting the raw data has a variable parts. The exact filename is
116 116 # specified inside the "docket" file.
117 117 #
118 118 # The docket file contains information to find, qualify and validate the raw
119 119 # data. Its content is currently very light, but it will expand as the on disk
120 120 # nodemap gains the necessary features to be used in production.
121 121
122 122 # version 0 is experimental, no BC garantee, do no use outside of tests.
123 123 ONDISK_VERSION = 0
124 124
125 125 S_VERSION = struct.Struct(">B")
126 126 S_HEADER = struct.Struct(">B")
127 127
128 128 ID_SIZE = 8
129 129
130 130
131 131 def _make_uid():
132 132 """return a new unique identifier.
133 133
134 134 The identifier is random and composed of ascii characters."""
135 135 return nodemod.hex(os.urandom(ID_SIZE))
136 136
137 137
138 138 def _serialize_docket(uid):
139 139 """return serialized bytes for a docket using the passed uid"""
140 140 data = []
141 141 data.append(S_VERSION.pack(ONDISK_VERSION))
142 142 data.append(S_HEADER.pack(len(uid)))
143 143 data.append(uid)
144 144 return b''.join(data)
145 145
146 146
147 147 def _rawdata_filepath(revlog, uid):
148 148 """The (vfs relative) nodemap's rawdata file for a given uid"""
149 149 prefix = revlog.nodemap_file[:-2]
150 150 return b"%s-%s.nd" % (prefix, uid)
151 151
152 152
153 153 def _other_rawdata_filepath(revlog, uid):
154 154 prefix = revlog.nodemap_file[:-2]
155 155 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
156 156 new_file_path = _rawdata_filepath(revlog, uid)
157 157 new_file_name = revlog.opener.basename(new_file_path)
158 158 dirpath = revlog.opener.dirname(new_file_path)
159 159 others = []
160 160 for f in revlog.opener.listdir(dirpath):
161 161 if pattern.match(f) and f != new_file_name:
162 162 others.append(f)
163 163 return others
164 164
165 165
166 166 ### Nodemap Trie
167 167 #
168 168 # This is a simple reference implementation to compute and persist a nodemap
169 169 # trie. This reference implementation is write only. The python version of this
170 170 # is not expected to be actually used, since it wont provide performance
171 171 # improvement over existing non-persistent C implementation.
172 172 #
173 173 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
174 174 # revision can be adressed using its node shortest prefix.
175 175 #
176 176 # The trie is stored as a sequence of block. Each block contains 16 entries
177 177 # (signed 64bit integer, big endian). Each entry can be one of the following:
178 178 #
179 179 # * value >= 0 -> index of sub-block
180 180 # * value == -1 -> no value
181 181 # * value < -1 -> a revision value: rev = -(value+10)
182 182 #
183 183 # The implementation focus on simplicity, not on performance. A Rust
184 184 # implementation should provide a efficient version of the same binary
185 185 # persistence. This reference python implementation is never meant to be
186 186 # extensively use in production.
187 187
188 188
189 189 def persistent_data(index):
190 190 """return the persistent binary form for a nodemap for a given index
191 191 """
192 192 trie = _build_trie(index)
193 193 return _persist_trie(trie)
194 194
195 195
196 196 S_BLOCK = struct.Struct(">" + ("l" * 16))
197 197
198 198 NO_ENTRY = -1
199 199 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
200 200 REV_OFFSET = 2
201 201
202 202
203 203 def _transform_rev(rev):
204 204 """Return the number used to represent the rev in the tree.
205 205
206 206 (or retrieve a rev number from such representation)
207 207
208 208 Note that this is an involution, a function equal to its inverse (i.e.
209 209 which gives the identity when applied to itself).
210 210 """
211 211 return -(rev + REV_OFFSET)
212 212
213 213
214 214 def _to_int(hex_digit):
215 215 """turn an hexadecimal digit into a proper integer"""
216 216 return int(hex_digit, 16)
217 217
218 218
219 class Block(dict):
220 """represent a block of the Trie
221
222 contains up to 16 entry indexed from 0 to 15"""
223
224
219 225 def _build_trie(index):
220 226 """build a nodemap trie
221 227
222 228 The nodemap stores revision number for each unique prefix.
223 229
224 230 Each block is a dictionary with keys in `[0, 15]`. Values are either
225 231 another block or a revision number.
226 232 """
227 root = {}
233 root = Block()
228 234 for rev in range(len(index)):
229 235 hex = nodemod.hex(index[rev][7])
230 236 _insert_into_block(index, 0, root, rev, hex)
231 237 return root
232 238
233 239
234 240 def _insert_into_block(index, level, block, current_rev, current_hex):
235 241 """insert a new revision in a block
236 242
237 243 index: the index we are adding revision for
238 244 level: the depth of the current block in the trie
239 245 block: the block currently being considered
240 246 current_rev: the revision number we are adding
241 247 current_hex: the hexadecimal representation of the of that revision
242 248 """
243 249 hex_digit = _to_int(current_hex[level : level + 1])
244 250 entry = block.get(hex_digit)
245 251 if entry is None:
246 252 # no entry, simply store the revision number
247 253 block[hex_digit] = current_rev
248 254 elif isinstance(entry, dict):
249 255 # need to recurse to an underlying block
250 256 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
251 257 else:
252 258 # collision with a previously unique prefix, inserting new
253 259 # vertices to fit both entry.
254 260 other_hex = nodemod.hex(index[entry][7])
255 261 other_rev = entry
256 new = {}
262 new = Block()
257 263 block[hex_digit] = new
258 264 _insert_into_block(index, level + 1, new, other_rev, other_hex)
259 265 _insert_into_block(index, level + 1, new, current_rev, current_hex)
260 266
261 267
262 268 def _persist_trie(root):
263 269 """turn a nodemap trie into persistent binary data
264 270
265 271 See `_build_trie` for nodemap trie structure"""
266 272 block_map = {}
267 273 chunks = []
268 274 for tn in _walk_trie(root):
269 275 block_map[id(tn)] = len(chunks)
270 276 chunks.append(_persist_block(tn, block_map))
271 277 return b''.join(chunks)
272 278
273 279
274 280 def _walk_trie(block):
275 281 """yield all the block in a trie
276 282
277 283 Children blocks are always yield before their parent block.
278 284 """
279 285 for (_, item) in sorted(block.items()):
280 286 if isinstance(item, dict):
281 287 for sub_block in _walk_trie(item):
282 288 yield sub_block
283 289 yield block
284 290
285 291
286 292 def _persist_block(block_node, block_map):
287 293 """produce persistent binary data for a single block
288 294
289 295 Children block are assumed to be already persisted and present in
290 296 block_map.
291 297 """
292 298 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
293 299 return S_BLOCK.pack(*data)
294 300
295 301
296 302 def _to_value(item, block_map):
297 303 """persist any value as an integer"""
298 304 if item is None:
299 305 return NO_ENTRY
300 306 elif isinstance(item, dict):
301 307 return block_map[id(item)]
302 308 else:
303 309 return _transform_rev(item)
General Comments 0
You need to be logged in to leave comments. Login now