##// END OF EJS Templates
nodemap: delete older raw data file when creating a new ones...
marmoute -
r44793:563dfdfd default
parent child Browse files
Show More
@@ -1,276 +1,300 b''
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import os
12 import re
12 13 import struct
13 14
14 15 from .. import (
15 16 error,
16 17 node as nodemod,
17 18 pycompat,
18 19 )
19 20
20 21
21 22 class NodeMap(dict):
22 23 def __missing__(self, x):
23 24 raise error.RevlogError(b'unknown node: %s' % x)
24 25
25 26
26 27 def persisted_data(revlog):
27 28 """read the nodemap for a revlog from disk"""
28 29 if revlog.nodemap_file is None:
29 30 return None
30 31 pdata = revlog.opener.tryread(revlog.nodemap_file)
31 32 if not pdata:
32 33 return None
33 34 offset = 0
34 35 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
35 36 if version != ONDISK_VERSION:
36 37 return None
37 38 offset += S_VERSION.size
38 39 (uuid_size,) = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
39 40 offset += S_HEADER.size
40 41 uid = pdata[offset : offset + uuid_size]
41 42
42 43 filename = _rawdata_filepath(revlog, uid)
43 44 return revlog.opener.tryread(filename)
44 45
45 46
46 47 def setup_persistent_nodemap(tr, revlog):
47 48 """Install whatever is needed transaction side to persist a nodemap on disk
48 49
49 50 (only actually persist the nodemap if this is relevant for this revlog)
50 51 """
51 52 if revlog._inline:
52 53 return # inlined revlog are too small for this to be relevant
53 54 if revlog.nodemap_file is None:
54 55 return # we do not use persistent_nodemap on this revlog
55 56 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
56 57 if tr.hasfinalize(callback_id):
57 58 return # no need to register again
58 59 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
59 60
60 61
61 62 def _persist_nodemap(tr, revlog):
62 63 """Write nodemap data on disk for a given revlog
63 64 """
64 65 if getattr(revlog, 'filteredrevs', ()):
65 66 raise error.ProgrammingError(
66 67 "cannot persist nodemap of a filtered changelog"
67 68 )
68 69 if revlog.nodemap_file is None:
69 70 msg = "calling persist nodemap on a revlog without the feature enableb"
70 71 raise error.ProgrammingError(msg)
71 72 data = persistent_data(revlog.index)
72 73 uid = _make_uid()
73 74 datafile = _rawdata_filepath(revlog, uid)
75 olds = _other_rawdata_filepath(revlog, uid)
76 if olds:
77 realvfs = getattr(revlog, '_realopener', revlog.opener)
78
79 def cleanup(tr):
80 for oldfile in olds:
81 realvfs.tryunlink(oldfile)
82
83 callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
84 tr.addpostclose(callback_id, cleanup)
74 85 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
75 86 # store vfs
76 87 with revlog.opener(datafile, b'w') as fd:
77 88 fd.write(data)
78 89 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
79 90 # store vfs
80 91 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
81 92 fp.write(_serialize_docket(uid))
82 93 # EXP-TODO: if the transaction abort, we should remove the new data and
83 94 # reinstall the old one.
84 95
85 96
86 97 ### Nodemap docket file
87 98 #
88 99 # The nodemap data are stored on disk using 2 files:
89 100 #
90 101 # * a raw data files containing a persistent nodemap
91 102 # (see `Nodemap Trie` section)
92 103 #
93 104 # * a small "docket" file containing medatadata
94 105 #
95 106 # While the nodemap data can be multiple tens of megabytes, the "docket" is
96 107 # small, it is easy to update it automatically or to duplicated its content
97 108 # during a transaction.
98 109 #
99 110 # Multiple raw data can exist at the same time (The currently valid one and a
100 111 # new one beind used by an in progress transaction). To accomodate this, the
101 112 # filename hosting the raw data has a variable parts. The exact filename is
102 113 # specified inside the "docket" file.
103 114 #
104 115 # The docket file contains information to find, qualify and validate the raw
105 116 # data. Its content is currently very light, but it will expand as the on disk
106 117 # nodemap gains the necessary features to be used in production.
107 118
108 119 # version 0 is experimental, no BC garantee, do no use outside of tests.
109 120 ONDISK_VERSION = 0
110 121
111 122 S_VERSION = struct.Struct(">B")
112 123 S_HEADER = struct.Struct(">B")
113 124
114 125 ID_SIZE = 8
115 126
116 127
117 128 def _make_uid():
118 129 """return a new unique identifier.
119 130
120 131 The identifier is random and composed of ascii characters."""
121 132 return nodemod.hex(os.urandom(ID_SIZE))
122 133
123 134
124 135 def _serialize_docket(uid):
125 136 """return serialized bytes for a docket using the passed uid"""
126 137 data = []
127 138 data.append(S_VERSION.pack(ONDISK_VERSION))
128 139 data.append(S_HEADER.pack(len(uid)))
129 140 data.append(uid)
130 141 return b''.join(data)
131 142
132 143
133 144 def _rawdata_filepath(revlog, uid):
134 145 """The (vfs relative) nodemap's rawdata file for a given uid"""
135 146 prefix = revlog.nodemap_file[:-2]
136 147 return b"%s-%s.nd" % (prefix, uid)
137 148
138 149
150 def _other_rawdata_filepath(revlog, uid):
151 prefix = revlog.nodemap_file[:-2]
152 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
153 new_file_path = _rawdata_filepath(revlog, uid)
154 new_file_name = revlog.opener.basename(new_file_path)
155 dirpath = revlog.opener.dirname(new_file_path)
156 others = []
157 for f in revlog.opener.listdir(dirpath):
158 if pattern.match(f) and f != new_file_name:
159 others.append(f)
160 return others
161
162
139 163 ### Nodemap Trie
140 164 #
141 165 # This is a simple reference implementation to compute and persist a nodemap
142 166 # trie. This reference implementation is write only. The python version of this
143 167 # is not expected to be actually used, since it wont provide performance
144 168 # improvement over existing non-persistent C implementation.
145 169 #
146 170 # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
147 171 # revision can be adressed using its node shortest prefix.
148 172 #
149 173 # The trie is stored as a sequence of block. Each block contains 16 entries
150 174 # (signed 64bit integer, big endian). Each entry can be one of the following:
151 175 #
152 176 # * value >= 0 -> index of sub-block
153 177 # * value == -1 -> no value
154 178 # * value < -1 -> a revision value: rev = -(value+10)
155 179 #
156 180 # The implementation focus on simplicity, not on performance. A Rust
157 181 # implementation should provide a efficient version of the same binary
158 182 # persistence. This reference python implementation is never meant to be
159 183 # extensively use in production.
160 184
161 185
162 186 def persistent_data(index):
163 187 """return the persistent binary form for a nodemap for a given index
164 188 """
165 189 trie = _build_trie(index)
166 190 return _persist_trie(trie)
167 191
168 192
169 193 S_BLOCK = struct.Struct(">" + ("l" * 16))
170 194
171 195 NO_ENTRY = -1
172 196 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
173 197 REV_OFFSET = 2
174 198
175 199
176 200 def _transform_rev(rev):
177 201 """Return the number used to represent the rev in the tree.
178 202
179 203 (or retrieve a rev number from such representation)
180 204
181 205 Note that this is an involution, a function equal to its inverse (i.e.
182 206 which gives the identity when applied to itself).
183 207 """
184 208 return -(rev + REV_OFFSET)
185 209
186 210
187 211 def _to_int(hex_digit):
188 212 """turn an hexadecimal digit into a proper integer"""
189 213 return int(hex_digit, 16)
190 214
191 215
192 216 def _build_trie(index):
193 217 """build a nodemap trie
194 218
195 219 The nodemap stores revision number for each unique prefix.
196 220
197 221 Each block is a dictionary with keys in `[0, 15]`. Values are either
198 222 another block or a revision number.
199 223 """
200 224 root = {}
201 225 for rev in range(len(index)):
202 226 hex = nodemod.hex(index[rev][7])
203 227 _insert_into_block(index, 0, root, rev, hex)
204 228 return root
205 229
206 230
207 231 def _insert_into_block(index, level, block, current_rev, current_hex):
208 232 """insert a new revision in a block
209 233
210 234 index: the index we are adding revision for
211 235 level: the depth of the current block in the trie
212 236 block: the block currently being considered
213 237 current_rev: the revision number we are adding
214 238 current_hex: the hexadecimal representation of the of that revision
215 239 """
216 240 hex_digit = _to_int(current_hex[level : level + 1])
217 241 entry = block.get(hex_digit)
218 242 if entry is None:
219 243 # no entry, simply store the revision number
220 244 block[hex_digit] = current_rev
221 245 elif isinstance(entry, dict):
222 246 # need to recurse to an underlying block
223 247 _insert_into_block(index, level + 1, entry, current_rev, current_hex)
224 248 else:
225 249 # collision with a previously unique prefix, inserting new
226 250 # vertices to fit both entry.
227 251 other_hex = nodemod.hex(index[entry][7])
228 252 other_rev = entry
229 253 new = {}
230 254 block[hex_digit] = new
231 255 _insert_into_block(index, level + 1, new, other_rev, other_hex)
232 256 _insert_into_block(index, level + 1, new, current_rev, current_hex)
233 257
234 258
235 259 def _persist_trie(root):
236 260 """turn a nodemap trie into persistent binary data
237 261
238 262 See `_build_trie` for nodemap trie structure"""
239 263 block_map = {}
240 264 chunks = []
241 265 for tn in _walk_trie(root):
242 266 block_map[id(tn)] = len(chunks)
243 267 chunks.append(_persist_block(tn, block_map))
244 268 return b''.join(chunks)
245 269
246 270
247 271 def _walk_trie(block):
248 272 """yield all the block in a trie
249 273
250 274 Children blocks are always yield before their parent block.
251 275 """
252 276 for (_, item) in sorted(block.items()):
253 277 if isinstance(item, dict):
254 278 for sub_block in _walk_trie(item):
255 279 yield sub_block
256 280 yield block
257 281
258 282
259 283 def _persist_block(block_node, block_map):
260 284 """produce persistent binary data for a single block
261 285
262 286 Children block are assumed to be already persisted and present in
263 287 block_map.
264 288 """
265 289 data = tuple(_to_value(block_node.get(i), block_map) for i in range(16))
266 290 return S_BLOCK.pack(*data)
267 291
268 292
269 293 def _to_value(item, block_map):
270 294 """persist any value as an integer"""
271 295 if item is None:
272 296 return NO_ENTRY
273 297 elif isinstance(item, dict):
274 298 return block_map[id(item)]
275 299 else:
276 300 return _transform_rev(item)
@@ -1,34 +1,48 b''
1 1 ===================================
2 2 Test the persistent on-disk nodemap
3 3 ===================================
4 4
5 5
6 6 $ hg init test-repo
7 7 $ cd test-repo
8 8 $ cat << EOF >> .hg/hgrc
9 9 > [experimental]
10 10 > exp-persistent-nodemap=yes
11 11 > EOF
12 12 $ hg debugbuilddag .+5000
13 13 $ f --size .hg/store/00changelog.n
14 14 .hg/store/00changelog.n: size=18
15 $ f --sha256 .hg/store/00changelog-*.nd
16 .hg/store/00changelog-????????????????.nd: sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7 (glob)
15 17 $ hg debugnodemap --dump-new | f --sha256 --size
16 18 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
17 19 $ hg debugnodemap --dump-disk | f --sha256 --bytes=256 --hexdump --size
18 20 size=122880, sha256=b961925120e1c9bc345c199b2cc442abc477029fdece37ef9d99cbe59c0558b7
19 21 0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
20 22 0010: ff ff ff ff ff ff ff ff ff ff fa c2 ff ff ff ff |................|
21 23 0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
22 24 0030: ff ff ff ff ff ff ed b3 ff ff ff ff ff ff ff ff |................|
23 25 0040: ff ff ff ff ff ff ee 34 00 00 00 00 ff ff ff ff |.......4........|
24 26 0050: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
25 27 0060: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
26 28 0070: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
27 29 0080: ff ff ff ff ff ff f8 50 ff ff ff ff ff ff ff ff |.......P........|
28 30 0090: ff ff ff ff ff ff ff ff ff ff ec c7 ff ff ff ff |................|
29 31 00a0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
30 32 00b0: ff ff ff ff ff ff fa be ff ff f2 fc ff ff ff ff |................|
31 33 00c0: ff ff ff ff ff ff ef ea ff ff ff ff ff ff f9 17 |................|
32 34 00d0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
33 35 00e0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
34 36 00f0: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
37
38 add a new commit
39
40 $ hg up
41 0 files updated, 0 files merged, 0 files removed, 0 files unresolved
42 $ echo foo > foo
43 $ hg add foo
44 $ hg ci -m 'foo'
45 $ f --size .hg/store/00changelog.n
46 .hg/store/00changelog.n: size=18
47 $ f --sha256 .hg/store/00changelog-*.nd --size
48 .hg/store/00changelog-????????????????.nd: size=122880, sha256=bfafebd751c4f6d116a76a37a1dee2a251747affe7efbcc4f4842ccc746d4db9 (glob)
General Comments 0
You need to be logged in to leave comments. Login now