##// END OF EJS Templates
nodemap: use an intermediate "docket" file to carry small metadata...
marmoute -
r44792:2b72c4ff default
parent child Browse files
Show More
@@ -1,204 +1,276
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 import os
11 12 import struct
12 13
13 14 from .. import (
14 15 error,
15 16 node as nodemod,
16 17 pycompat,
17 18 )
18 19
19 20
20 21 class NodeMap(dict):
21 22 def __missing__(self, x):
22 23 raise error.RevlogError(b'unknown node: %s' % x)
23 24
24 25
25 26 def persisted_data(revlog):
26 27 """read the nodemap for a revlog from disk"""
27 28 if revlog.nodemap_file is None:
28 29 return None
29 return revlog.opener.tryread(revlog.nodemap_file)
30 pdata = revlog.opener.tryread(revlog.nodemap_file)
31 if not pdata:
32 return None
33 offset = 0
34 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
35 if version != ONDISK_VERSION:
36 return None
37 offset += S_VERSION.size
38 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
39 offset += S_HEADER.size
40 uid = pdata[offset : offset + uuid_size]
41
42 filename = _rawdata_filepath(revlog, uid)
43 return revlog.opener.tryread(filename)
30 44
31 45
32 46 def setup_persistent_nodemap(tr, revlog):
33 47 """Install whatever is needed transaction side to persist a nodemap on disk
34 48
35 49 (only actually persist the nodemap if this is relevant for this revlog)
36 50 """
37 51 if revlog._inline:
38 52 return # inlined revlog are too small for this to be relevant
39 53 if revlog.nodemap_file is None:
40 54 return # we do not use persistent_nodemap on this revlog
41 55 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
42 56 if tr.hasfinalize(callback_id):
43 57 return # no need to register again
44 58 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
45 59
46 60
47 61 def _persist_nodemap(tr, revlog):
48 62 """Write nodemap data on disk for a given revlog
49 63 """
50 64 if getattr(revlog, 'filteredrevs', ()):
51 65 raise error.ProgrammingError(
52 66 "cannot persist nodemap of a filtered changelog"
53 67 )
54 68 if revlog.nodemap_file is None:
55 69 msg = "calling persist nodemap on a revlog without the feature enableb"
56 70 raise error.ProgrammingError(msg)
57 71 data = persistent_data(revlog.index)
72 uid = _make_uid()
73 datafile = _rawdata_filepath(revlog, uid)
74 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
75 # store vfs
76 with revlog.opener(datafile, b'w') as fd:
77 fd.write(data)
58 78 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
59 79 # store vfs
60 with revlog.opener(revlog.nodemap_file, b'w') as f:
61 f.write(data)
80 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
81 fp.write(_serialize_docket(uid))
62 82 # EXP-TODO: if the transaction abort, we should remove the new data and
63 # reinstall the old one. (This will be simpler when the file format get a
64 # bit more advanced)
83 # reinstall the old one.
84
85
86 ### Nodemap docket file
87 #
88 # The nodemap data are stored on disk using 2 files:
89 #
90 # * a raw data files containing a persistent nodemap
91 # (see `Nodemap Trie` section)
92 #
93 # * a small "docket" file containing medatadata
94 #
95 # While the nodemap data can be multiple tens of megabytes, the "docket" is
96 # small, it is easy to update it automatically or to duplicated its content
97 # during a transaction.
98 #
99 # Multiple raw data can exist at the same time (The currently valid one and a
100 # new one beind used by an in progress transaction). To accomodate this, the
101 # filename hosting the raw data has a variable parts. The exact filename is
102 # specified inside the "docket" file.
103 #
104 # The docket file contains information to find, qualify and validate the raw
105 # data. Its content is currently very light, but it will expand as the on disk
106 # nodemap gains the necessary features to be used in production.
107
108 # version 0 is experimental, no BC garantee, do no use outside of tests.
109 ONDISK_VERSION = 0
110
111 S_VERSION = struct.Struct(">B")
112 S_HEADER = struct.Struct(">B")
113
114 ID_SIZE = 8
115
116
117 def _make_uid():
118 """return a new unique identifier.
119
120 The identifier is random and composed of ascii characters."""
121 return nodemod.hex(os.urandom(ID_SIZE))
122
123
124 def _serialize_docket(uid):
125 """return serialized bytes for a docket using the passed uid"""
126 data = []
127 data.append(S_VERSION.pack(ONDISK_VERSION))
128 data.append(S_HEADER.pack(len(uid)))
129 data.append(uid)
130 return b''.join(data)
131
132
133 def _rawdata_filepath(revlog, uid):
134 """The (vfs relative) nodemap's rawdata file for a given uid"""
135 prefix = revlog.nodemap_file[:-2]
136 return b"%s-%s.nd" % (prefix, uid)
65 137
66 138
67 139 ### Nodemap Trie
68 140 #
69 141 # This is a simple reference implementation to compute and persist a nodemap
70 142 # trie. This reference implementation is write only. The python version of this
71 143 # is not expected to be actually used, since it wont provide performance
72 144 # improvement over existing non-persistent C implementation.
73 145 #
74 146 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
75 147 # revision can be adressed using its node shortest prefix.
76 148 #
77 149 # The trie is stored as a sequence of block. Each block contains 16 entries
78 150 # (signed 64bit integer, big endian). Each entry can be one of the following:
79 151 #
80 152 # * value >= 0 -> index of sub-block
81 153 # * value == -1 -> no value
82 154 # * value < -1 -> a revision value: rev = -(value+10)
83 155 #
84 156 # The implementation focus on simplicity, not on performance. A Rust
85 157 # implementation should provide a efficient version of the same binary
86 158 # persistence. This reference python implementation is never meant to be
87 159 # extensively use in production.
88 160
89 161
90 162 def persistent_data(index):
91 163 """return the persistent binary form for a nodemap for a given index
92 164 """
93 165 trie = _build_trie(index)
94 166 return _persist_trie(trie)
95 167
96 168
97 169 S_BLOCK = struct.Struct(">" + ("l" * 16))
98 170
99 171 NO_ENTRY = -1
100 172 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
101 173 REV_OFFSET = 2
102 174
103 175
104 176 def _transform_rev(rev):
105 177 """Return the number used to represent the rev in the tree.
106 178
107 179 (or retrieve a rev number from such representation)
108 180
109 181 Note that this is an involution, a function equal to its inverse (i.e.
110 182 which gives the identity when applied to itself).
111 183 """
112 184 return -(rev + REV_OFFSET)
113 185
114 186
115 187 def _to_int(hex_digit):
116 188 """turn an hexadecimal digit into a proper integer"""
117 189 return int(hex_digit, 16)
118 190
119 191
120 192 def _build_trie(index):
121 193 """build a nodemap trie
122 194
123 195 The nodemap stores revision number for each unique prefix.
124 196
125 197 Each block is a dictionary with keys in `[0, 15]`. Values are either
126 198 another block or a revision number.
127 199 """
128 200 root = {}
129 201 for rev in range(len(index)):
130 202 hex = nodemod.hex(index[rev][7])
131 203 _insert_into_block(index, 0, root, rev, hex)
132 204 return root
133 205
134 206
135 207 def _insert_into_block(index, level, block, current_rev, current_hex):
136 208 """insert a new revision in a block
137 209
138 210 index: the index we are adding revision for
139 211 level: the depth of the current block in the trie
140 212 block: the block currently being considered
141 213 current_rev: the revision number we are adding
142 214 current_hex: the hexadecimal representation of the of that revision
143 215 """
144 216 hex_digit = _to_int(current_hex[level : level + 1])
145 217 entry = block.get(hex_digit)
146 218 if entry is None:
147 219 # no entry, simply store the revision number
148 220 block[hex_digit] = current_rev
149 221 elif isinstance(entry, dict):
150 222 # need to recurse to an underlying block
151 223 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
152 224 else:
153 225 # collision with a previously unique prefix, inserting new
154 226 # vertices to fit both entry.
155 227 other_hex = nodemod.hex(index[entry][7])
156 228 other_rev = entry
157 229 new = {}
158 230 block[hex_digit] = new
159 231 _insert_into_block(index, level + 1, new, other_rev, other_hex)
160 232 _insert_into_block(index, level + 1, new, current_rev, current_hex)
161 233
162 234
163 235 def _persist_trie(root):
164 236 """turn a nodemap trie into persistent binary data
165 237
166 238 See `_build_trie` for nodemap trie structure"""
167 239 block_map = {}
168 240 chunks = []
169 241 for tn in _walk_trie(root):
170 242 block_map[id(tn)] = len(chunks)
171 243 chunks.append(_persist_block(tn, block_map))
172 244 return b''.join(chunks)
173 245
174 246
175 247 def _walk_trie(block):
176 248 """yield all the block in a trie
177 249
178 250 Children blocks are always yield before their parent block.
179 251 """
180 252 for (_, item) in sorted(block.items()):
181 253 if isinstance(item, dict):
182 254 for sub_block in _walk_trie(item):
183 255 yield sub_block
184 256 yield block
185 257
186 258
187 259 def _persist_block(block_node, block_map):
188 260 """produce persistent binary data for a single block
189 261
190 262 Children block are assumed to be already persisted and present in
191 263 block_map.
192 264 """
193 265 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
194 266 return S_BLOCK.pack(*data)
195 267
196 268
197 269 def _to_value(item, block_map):
198 270 """persist any value as an integer"""
199 271 if item is None:
200 272 return NO_ENTRY
201 273 elif isinstance(item, dict):
202 274 return block_map[id(item)]
203 275 else:
204 276 return _transform_rev(item)
@@ -1,32 +1,34
1 1 ===================================
2 2 Test the persistent on-disk nodemap
3 3 ===================================
4 4
5 5
6 6 $ hg init test-repo
7 7 $ cd test-repo
8 8 $ cat << EOF >> .hg/hgrc
9 9 > [experimental]
10 10 > exp-persistent-nodemap=yes
11 11 > EOF
12 12 $ hg debugbuilddag .+5000
13 $ f --size .hg/store/00changelog.n
14 .hg/store/00changelog.n: size=18
13 15 $ hg debugnodemap --dump-new | f --sha256 --size
14 16 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
15 17 $ hg debugnodemap --dump-disk | f --sha256 --bytes=256 --hexdump --size
16 18 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
17 19 0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
18 20 0010: ff ff ff ff ff ff ff ff ff ff fa c2 ff ff ff ff |................|
19 21 0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
20 22 0030: ff ff ff ff ff ff ed b3 ff ff ff ff ff ff ff ff |................|
21 23 0040: ff ff ff ff ff ff ee 34 00 00 00 00 ff ff ff ff |.......4........|
22 24 0050: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
23 25 0060: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
24 26 0070: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
25 27 0080: ff ff ff ff ff ff f8 50 ff ff ff ff ff ff ff ff |.......P........|
26 28 0090: ff ff ff ff ff ff ff ff ff ff ec c7 ff ff ff ff |................|
27 29 00a0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
28 30 00b0: ff ff ff ff ff ff fa be ff ff f2 fc ff ff ff ff |................|
29 31 00c0: ff ff ff ff ff ff ef ea ff ff ff ff ff ff f9 17 |................|
30 32 00d0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
31 33 00e0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
32 34 00f0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
General Comments 0
You need to be logged in to leave comments. Login now