##// END OF EJS Templates
nodemap: use an intermediate "docket" file to carry small metadata...
marmoute -
r44792:2b72c4ff default
parent child Browse files
Show More
@@ -1,204 +1,276
1 # nodemap.py - nodemap related code and utilities
1 # nodemap.py - nodemap related code and utilities
2 #
2 #
3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 #
5 #
6 # This software may be used and distributed according to the terms of the
6 # This software may be used and distributed according to the terms of the
7 # GNU General Public License version 2 or any later version.
7 # GNU General Public License version 2 or any later version.
8
8
9 from __future__ import absolute_import
9 from __future__ import absolute_import
10
10
11 import os
11 import struct
12 import struct
12
13
13 from .. import (
14 from .. import (
14 error,
15 error,
15 node as nodemod,
16 node as nodemod,
16 pycompat,
17 pycompat,
17 )
18 )
18
19
19
20
20 class NodeMap(dict):
21 class NodeMap(dict):
21 def __missing__(self, x):
22 def __missing__(self, x):
22 raise error.RevlogError(b'unknown node: %s' % x)
23 raise error.RevlogError(b'unknown node: %s' % x)
23
24
24
25
25 def persisted_data(revlog):
26 def persisted_data(revlog):
26 """read the nodemap for a revlog from disk"""
27 """read the nodemap for a revlog from disk"""
27 if revlog.nodemap_file is None:
28 if revlog.nodemap_file is None:
28 return None
29 return None
29 return revlog.opener.tryread(revlog.nodemap_file)
30 pdata = revlog.opener.tryread(revlog.nodemap_file)
31 if not pdata:
32 return None
33 offset = 0
34 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
35 if version != ONDISK_VERSION:
36 return None
37 offset += S_VERSION.size
38 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
39 offset += S_HEADER.size
40 uid = pdata[offset : offset + uuid_size]
41
42 filename = _rawdata_filepath(revlog, uid)
43 return revlog.opener.tryread(filename)
30
44
31
45
32 def setup_persistent_nodemap(tr, revlog):
46 def setup_persistent_nodemap(tr, revlog):
33 """Install whatever is needed transaction side to persist a nodemap on disk
47 """Install whatever is needed transaction side to persist a nodemap on disk
34
48
35 (only actually persist the nodemap if this is relevant for this revlog)
49 (only actually persist the nodemap if this is relevant for this revlog)
36 """
50 """
37 if revlog._inline:
51 if revlog._inline:
38 return # inlined revlog are too small for this to be relevant
52 return # inlined revlog are too small for this to be relevant
39 if revlog.nodemap_file is None:
53 if revlog.nodemap_file is None:
40 return # we do not use persistent_nodemap on this revlog
54 return # we do not use persistent_nodemap on this revlog
41 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
55 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
42 if tr.hasfinalize(callback_id):
56 if tr.hasfinalize(callback_id):
43 return # no need to register again
57 return # no need to register again
44 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
58 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
45
59
46
60
47 def _persist_nodemap(tr, revlog):
61 def _persist_nodemap(tr, revlog):
48 """Write nodemap data on disk for a given revlog
62 """Write nodemap data on disk for a given revlog
49 """
63 """
50 if getattr(revlog, 'filteredrevs', ()):
64 if getattr(revlog, 'filteredrevs', ()):
51 raise error.ProgrammingError(
65 raise error.ProgrammingError(
52 "cannot persist nodemap of a filtered changelog"
66 "cannot persist nodemap of a filtered changelog"
53 )
67 )
54 if revlog.nodemap_file is None:
68 if revlog.nodemap_file is None:
55 msg = "calling persist nodemap on a revlog without the feature enableb"
69 msg = "calling persist nodemap on a revlog without the feature enableb"
56 raise error.ProgrammingError(msg)
70 raise error.ProgrammingError(msg)
57 data = persistent_data(revlog.index)
71 data = persistent_data(revlog.index)
72 uid = _make_uid()
73 datafile = _rawdata_filepath(revlog, uid)
74 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
75 # store vfs
76 with revlog.opener(datafile, b'w') as fd:
77 fd.write(data)
58 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
78 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
59 # store vfs
79 # store vfs
60 with revlog.opener(revlog.nodemap_file, b'w') as f:
80 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
61 f.write(data)
81 fp.write(_serialize_docket(uid))
62 # EXP-TODO: if the transaction abort, we should remove the new data and
82 # EXP-TODO: if the transaction abort, we should remove the new data and
63 # reinstall the old one. (This will be simpler when the file format get a
83 # reinstall the old one.
64 # bit more advanced)
84
85
86 ### Nodemap docket file
87 #
88 # The nodemap data are stored on disk using 2 files:
89 #
90 # * a raw data files containing a persistent nodemap
91 # (see `Nodemap Trie` section)
92 #
93 # * a small "docket" file containing medatadata
94 #
95 # While the nodemap data can be multiple tens of megabytes, the "docket" is
96 # small, it is easy to update it automatically or to duplicated its content
97 # during a transaction.
98 #
99 # Multiple raw data can exist at the same time (The currently valid one and a
100 # new one beind used by an in progress transaction). To accomodate this, the
101 # filename hosting the raw data has a variable parts. The exact filename is
102 # specified inside the "docket" file.
103 #
104 # The docket file contains information to find, qualify and validate the raw
105 # data. Its content is currently very light, but it will expand as the on disk
106 # nodemap gains the necessary features to be used in production.
107
108 # version 0 is experimental, no BC garantee, do no use outside of tests.
109 ONDISK_VERSION = 0
110
111 S_VERSION = struct.Struct(">B")
112 S_HEADER = struct.Struct(">B")
113
114 ID_SIZE = 8
115
116
117 def _make_uid():
118 """return a new unique identifier.
119
120 The identifier is random and composed of ascii characters."""
121 return nodemod.hex(os.urandom(ID_SIZE))
122
123
124 def _serialize_docket(uid):
125 """return serialized bytes for a docket using the passed uid"""
126 data = []
127 data.append(S_VERSION.pack(ONDISK_VERSION))
128 data.append(S_HEADER.pack(len(uid)))
129 data.append(uid)
130 return b''.join(data)
131
132
133 def _rawdata_filepath(revlog, uid):
134 """The (vfs relative) nodemap's rawdata file for a given uid"""
135 prefix = revlog.nodemap_file[:-2]
136 return b"%s-%s.nd" % (prefix, uid)
65
137
66
138
67 ### Nodemap Trie
139 ### Nodemap Trie
68 #
140 #
69 # This is a simple reference implementation to compute and persist a nodemap
141 # This is a simple reference implementation to compute and persist a nodemap
70 # trie. This reference implementation is write only. The python version of this
142 # trie. This reference implementation is write only. The python version of this
71 # is not expected to be actually used, since it wont provide performance
143 # is not expected to be actually used, since it wont provide performance
72 # improvement over existing non-persistent C implementation.
144 # improvement over existing non-persistent C implementation.
73 #
145 #
74 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
146 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
75 # revision can be adressed using its node shortest prefix.
147 # revision can be adressed using its node shortest prefix.
76 #
148 #
77 # The trie is stored as a sequence of block. Each block contains 16 entries
149 # The trie is stored as a sequence of block. Each block contains 16 entries
78 # (signed 64bit integer, big endian). Each entry can be one of the following:
150 # (signed 64bit integer, big endian). Each entry can be one of the following:
79 #
151 #
80 # * value >= 0 -> index of sub-block
152 # * value >= 0 -> index of sub-block
81 # * value == -1 -> no value
153 # * value == -1 -> no value
82 # * value < -1 -> a revision value: rev = -(value+10)
154 # * value < -1 -> a revision value: rev = -(value+10)
83 #
155 #
84 # The implementation focus on simplicity, not on performance. A Rust
156 # The implementation focus on simplicity, not on performance. A Rust
85 # implementation should provide a efficient version of the same binary
157 # implementation should provide a efficient version of the same binary
86 # persistence. This reference python implementation is never meant to be
158 # persistence. This reference python implementation is never meant to be
87 # extensively use in production.
159 # extensively use in production.
88
160
89
161
90 def persistent_data(index):
162 def persistent_data(index):
91 """return the persistent binary form for a nodemap for a given index
163 """return the persistent binary form for a nodemap for a given index
92 """
164 """
93 trie = _build_trie(index)
165 trie = _build_trie(index)
94 return _persist_trie(trie)
166 return _persist_trie(trie)
95
167
96
168
97 S_BLOCK = struct.Struct(">" + ("l" * 16))
169 S_BLOCK = struct.Struct(">" + ("l" * 16))
98
170
99 NO_ENTRY = -1
171 NO_ENTRY = -1
100 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
172 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
101 REV_OFFSET = 2
173 REV_OFFSET = 2
102
174
103
175
104 def _transform_rev(rev):
176 def _transform_rev(rev):
105 """Return the number used to represent the rev in the tree.
177 """Return the number used to represent the rev in the tree.
106
178
107 (or retrieve a rev number from such representation)
179 (or retrieve a rev number from such representation)
108
180
109 Note that this is an involution, a function equal to its inverse (i.e.
181 Note that this is an involution, a function equal to its inverse (i.e.
110 which gives the identity when applied to itself).
182 which gives the identity when applied to itself).
111 """
183 """
112 return -(rev + REV_OFFSET)
184 return -(rev + REV_OFFSET)
113
185
114
186
115 def _to_int(hex_digit):
187 def _to_int(hex_digit):
116 """turn an hexadecimal digit into a proper integer"""
188 """turn an hexadecimal digit into a proper integer"""
117 return int(hex_digit, 16)
189 return int(hex_digit, 16)
118
190
119
191
120 def _build_trie(index):
192 def _build_trie(index):
121 """build a nodemap trie
193 """build a nodemap trie
122
194
123 The nodemap stores revision number for each unique prefix.
195 The nodemap stores revision number for each unique prefix.
124
196
125 Each block is a dictionary with keys in `[0, 15]`. Values are either
197 Each block is a dictionary with keys in `[0, 15]`. Values are either
126 another block or a revision number.
198 another block or a revision number.
127 """
199 """
128 root = {}
200 root = {}
129 for rev in range(len(index)):
201 for rev in range(len(index)):
130 hex = nodemod.hex(index[rev][7])
202 hex = nodemod.hex(index[rev][7])
131 _insert_into_block(index, 0, root, rev, hex)
203 _insert_into_block(index, 0, root, rev, hex)
132 return root
204 return root
133
205
134
206
135 def _insert_into_block(index, level, block, current_rev, current_hex):
207 def _insert_into_block(index, level, block, current_rev, current_hex):
136 """insert a new revision in a block
208 """insert a new revision in a block
137
209
138 index: the index we are adding revision for
210 index: the index we are adding revision for
139 level: the depth of the current block in the trie
211 level: the depth of the current block in the trie
140 block: the block currently being considered
212 block: the block currently being considered
141 current_rev: the revision number we are adding
213 current_rev: the revision number we are adding
142 current_hex: the hexadecimal representation of the of that revision
214 current_hex: the hexadecimal representation of the of that revision
143 """
215 """
144 hex_digit = _to_int(current_hex[level : level + 1])
216 hex_digit = _to_int(current_hex[level : level + 1])
145 entry = block.get(hex_digit)
217 entry = block.get(hex_digit)
146 if entry is None:
218 if entry is None:
147 # no entry, simply store the revision number
219 # no entry, simply store the revision number
148 block[hex_digit] = current_rev
220 block[hex_digit] = current_rev
149 elif isinstance(entry, dict):
221 elif isinstance(entry, dict):
150 # need to recurse to an underlying block
222 # need to recurse to an underlying block
151 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
223 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
152 else:
224 else:
153 # collision with a previously unique prefix, inserting new
225 # collision with a previously unique prefix, inserting new
154 # vertices to fit both entry.
226 # vertices to fit both entry.
155 other_hex = nodemod.hex(index[entry][7])
227 other_hex = nodemod.hex(index[entry][7])
156 other_rev = entry
228 other_rev = entry
157 new = {}
229 new = {}
158 block[hex_digit] = new
230 block[hex_digit] = new
159 _insert_into_block(index, level + 1, new, other_rev, other_hex)
231 _insert_into_block(index, level + 1, new, other_rev, other_hex)
160 _insert_into_block(index, level + 1, new, current_rev, current_hex)
232 _insert_into_block(index, level + 1, new, current_rev, current_hex)
161
233
162
234
163 def _persist_trie(root):
235 def _persist_trie(root):
164 """turn a nodemap trie into persistent binary data
236 """turn a nodemap trie into persistent binary data
165
237
166 See `_build_trie` for nodemap trie structure"""
238 See `_build_trie` for nodemap trie structure"""
167 block_map = {}
239 block_map = {}
168 chunks = []
240 chunks = []
169 for tn in _walk_trie(root):
241 for tn in _walk_trie(root):
170 block_map[id(tn)] = len(chunks)
242 block_map[id(tn)] = len(chunks)
171 chunks.append(_persist_block(tn, block_map))
243 chunks.append(_persist_block(tn, block_map))
172 return b''.join(chunks)
244 return b''.join(chunks)
173
245
174
246
175 def _walk_trie(block):
247 def _walk_trie(block):
176 """yield all the block in a trie
248 """yield all the block in a trie
177
249
178 Children blocks are always yield before their parent block.
250 Children blocks are always yield before their parent block.
179 """
251 """
180 for (_, item) in sorted(block.items()):
252 for (_, item) in sorted(block.items()):
181 if isinstance(item, dict):
253 if isinstance(item, dict):
182 for sub_block in _walk_trie(item):
254 for sub_block in _walk_trie(item):
183 yield sub_block
255 yield sub_block
184 yield block
256 yield block
185
257
186
258
187 def _persist_block(block_node, block_map):
259 def _persist_block(block_node, block_map):
188 """produce persistent binary data for a single block
260 """produce persistent binary data for a single block
189
261
190 Children block are assumed to be already persisted and present in
262 Children block are assumed to be already persisted and present in
191 block_map.
263 block_map.
192 """
264 """
193 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
265 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
194 return S_BLOCK.pack(*data)
266 return S_BLOCK.pack(*data)
195
267
196
268
197 def _to_value(item, block_map):
269 def _to_value(item, block_map):
198 """persist any value as an integer"""
270 """persist any value as an integer"""
199 if item is None:
271 if item is None:
200 return NO_ENTRY
272 return NO_ENTRY
201 elif isinstance(item, dict):
273 elif isinstance(item, dict):
202 return block_map[id(item)]
274 return block_map[id(item)]
203 else:
275 else:
204 return _transform_rev(item)
276 return _transform_rev(item)
@@ -1,32 +1,34
1 ===================================
1 ===================================
2 Test the persistent on-disk nodemap
2 Test the persistent on-disk nodemap
3 ===================================
3 ===================================
4
4
5
5
6 $ hg init test-repo
6 $ hg init test-repo
7 $ cd test-repo
7 $ cd test-repo
8 $ cat << EOF >> .hg/hgrc
8 $ cat << EOF >> .hg/hgrc
9 > [experimental]
9 > [experimental]
10 > exp-persistent-nodemap=yes
10 > exp-persistent-nodemap=yes
11 > EOF
11 > EOF
12 $ hg debugbuilddag .+5000
12 $ hg debugbuilddag .+5000
13 $ f --size .hg/store/00changelog.n
14 .hg/store/00changelog.n: size=18
13 $ hg debugnodemap --dump-new | f --sha256 --size
15 $ hg debugnodemap --dump-new | f --sha256 --size
14 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
16 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
15 $ hg debugnodemap --dump-disk | f --sha256 --bytes=256 --hexdump --size
17 $ hg debugnodemap --dump-disk | f --sha256 --bytes=256 --hexdump --size
16 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
18 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
17 0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
19 0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
18 0010: ff ff ff ff ff ff ff ff ff ff fa c2 ff ff ff ff |................|
20 0010: ff ff ff ff ff ff ff ff ff ff fa c2 ff ff ff ff |................|
19 0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
21 0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
20 0030: ff ff ff ff ff ff ed b3 ff ff ff ff ff ff ff ff |................|
22 0030: ff ff ff ff ff ff ed b3 ff ff ff ff ff ff ff ff |................|
21 0040: ff ff ff ff ff ff ee 34 00 00 00 00 ff ff ff ff |.......4........|
23 0040: ff ff ff ff ff ff ee 34 00 00 00 00 ff ff ff ff |.......4........|
22 0050: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
24 0050: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
23 0060: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
25 0060: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
24 0070: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
26 0070: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
25 0080: ff ff ff ff ff ff f8 50 ff ff ff ff ff ff ff ff |.......P........|
27 0080: ff ff ff ff ff ff f8 50 ff ff ff ff ff ff ff ff |.......P........|
26 0090: ff ff ff ff ff ff ff ff ff ff ec c7 ff ff ff ff |................|
28 0090: ff ff ff ff ff ff ff ff ff ff ec c7 ff ff ff ff |................|
27 00a0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
29 00a0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
28 00b0: ff ff ff ff ff ff fa be ff ff f2 fc ff ff ff ff |................|
30 00b0: ff ff ff ff ff ff fa be ff ff f2 fc ff ff ff ff |................|
29 00c0: ff ff ff ff ff ff ef ea ff ff ff ff ff ff f9 17 |................|
31 00c0: ff ff ff ff ff ff ef ea ff ff ff ff ff ff f9 17 |................|
30 00d0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
32 00d0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
31 00e0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
33 00e0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
32 00f0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
34 00f0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
General Comments 0
You need to be logged in to leave comments. Login now