##// END OF EJS Templates
nodemap: fix missing r-prefix on regular expression...
Augie Fackler -
r44952:6aee0647 default
parent child Browse files
Show More
@@ -1,554 +1,554 b''
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import errno
12 12 import os
13 13 import re
14 14 import struct
15 15
16 16 from .. import (
17 17 error,
18 18 node as nodemod,
19 19 util,
20 20 )
21 21
22 22
class NodeMap(dict):
    """Map binary node ids to revision numbers.

    Lookups of unknown nodes raise ``error.RevlogError`` (rather than a
    plain ``KeyError``) so callers get a revlog-level error type.
    """

    def __missing__(self, x):
        raise error.RevlogError(b'unknown node: %s' % x)
26 26
27 27
def persisted_data(revlog):
    """read the nodemap for a revlog from disk

    Returns a ``(docket, data)`` pair, or ``None`` when no usable persistent
    nodemap exists (no nodemap file, unknown on-disk version, missing or
    truncated rawdata file).
    """
    if revlog.nodemap_file is None:
        return None
    pdata = revlog.opener.tryread(revlog.nodemap_file)
    if not pdata:
        return None
    offset = 0
    (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
    if version != ONDISK_VERSION:
        # unknown on-disk format: ignore the persisted data
        return None
    offset += S_VERSION.size
    headers = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
    uid_size, tip_rev, data_length, data_unused = headers
    offset += S_HEADER.size
    docket = NodeMapDocket(pdata[offset : offset + uid_size])
    docket.tip_rev = tip_rev
    docket.data_length = data_length
    docket.data_unused = data_unused

    filename = _rawdata_filepath(revlog, docket)
    use_mmap = revlog.opener.options.get("exp-persistent-nodemap.mmap")
    try:
        with revlog.opener(filename) as fd:
            if use_mmap:
                data = util.buffer(util.mmapread(fd, data_length))
            else:
                data = fd.read(data_length)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
        # the docket points at a rawdata file that no longer exists; the
        # previous code fell through here and crashed on an unbound `data`
        return None
    if len(data) < data_length:
        return None
    return docket, data
62 62
63 63
def setup_persistent_nodemap(tr, revlog):
    """Install whatever is needed transaction side to persist a nodemap on disk

    (only actually persist the nodemap if this is relevant for this revlog)
    """
    if revlog._inline:
        # inlined revlogs are too small for this to be relevant
        return
    if revlog.nodemap_file is None:
        # this revlog does not use a persistent nodemap
        return
    callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
    if not tr.hasfinalize(callback_id):
        # register only once per transaction
        tr.addfinalize(
            callback_id, lambda tr: _persist_nodemap(tr.addpostclose, revlog)
        )
79 79
80 80
def update_persistent_nodemap(revlog):
    """update the persistent nodemap right now

    To be used for updating the nodemap on disk outside of a normal transaction
    setup (eg, `debugupdatecache`).
    """
    # collect the cleanup callbacks instead of attaching them to a
    # transaction, then run them immediately
    cleanups = []
    _persist_nodemap(lambda _id, callback: cleanups.append(callback), revlog)
    for callback in cleanups:
        callback(None)
91 91
92 92
def _persist_nodemap(cleaner, revlog):
    """Write nodemap data on disk for a given revlog

    `cleaner` is a callable with a `(callback_id, callback)` signature, used
    to schedule removal of superseded rawdata files.
    """
    if getattr(revlog, 'filteredrevs', ()):
        raise error.ProgrammingError(
            "cannot persist nodemap of a filtered changelog"
        )
    if revlog.nodemap_file is None:
        # fixed typo: was "enableb"
        msg = "calling persist nodemap on a revlog without the feature enabled"
        raise error.ProgrammingError(msg)

    can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental")
    ondisk_docket = revlog._nodemap_docket
    feed_data = util.safehasattr(revlog.index, "update_nodemap_data")
    use_mmap = revlog.opener.options.get("exp-persistent-nodemap.mmap")

    data = None
    # first attempt an incremental update of the data
    if can_incremental and ondisk_docket is not None:
        target_docket = revlog._nodemap_docket.copy()
        (
            src_docket,
            data_changed_count,
            data,
        ) = revlog.index.nodemap_data_incremental()
        if src_docket != target_docket:
            data = None
        else:
            datafile = _rawdata_filepath(revlog, target_docket)
            # EXP-TODO: if this is a cache, this should use a cache vfs, not a
            # store vfs
            new_length = target_docket.data_length + len(data)
            with revlog.opener(datafile, b'r+') as fd:
                fd.seek(target_docket.data_length)
                fd.write(data)
                if feed_data:
                    # NOTE: the two branches were previously inverted; the
                    # mmap case must flush and mmap, the plain case must
                    # rewind and read (cf the dispatch in `persisted_data`)
                    if use_mmap:
                        fd.flush()
                        new_data = util.buffer(util.mmapread(fd, new_length))
                    else:
                        fd.seek(0)
                        new_data = fd.read(new_length)
            target_docket.data_length = new_length
            target_docket.data_unused += data_changed_count

    if data is None:
        # otherwise fallback to a full new export
        target_docket = NodeMapDocket()
        datafile = _rawdata_filepath(revlog, target_docket)
        if util.safehasattr(revlog.index, "nodemap_data_all"):
            data = revlog.index.nodemap_data_all()
        else:
            data = persistent_data(revlog.index)
        # EXP-TODO: if this is a cache, this should use a cache vfs, not a
        # store vfs
        with revlog.opener(datafile, b'w+') as fd:
            fd.write(data)
            if feed_data:
                if use_mmap:
                    fd.flush()
                    new_data = util.buffer(util.mmapread(fd, len(data)))
                else:
                    # data is already fully in memory, reuse it as-is
                    new_data = data
        target_docket.data_length = len(data)
    target_docket.tip_rev = revlog.tiprev()
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
        fp.write(target_docket.serialize())
    revlog._nodemap_docket = target_docket
    if feed_data:
        revlog.index.update_nodemap_data(target_docket, new_data)

    # EXP-TODO: if the transaction abort, we should remove the new data and
    # reinstall the old one.

    # search for old index file in all cases, some older process might have
    # left one behind.
    olds = _other_rawdata_filepath(revlog, target_docket)
    if olds:
        realvfs = getattr(revlog, '_realopener', revlog.opener)

        def cleanup(tr):
            for oldfile in olds:
                realvfs.tryunlink(oldfile)

        callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
        cleaner(callback_id, cleanup)
181 181
182 182
183 183 ### Nodemap docket file
184 184 #
185 185 # The nodemap data are stored on disk using 2 files:
186 186 #
187 187 # * a raw data files containing a persistent nodemap
188 188 # (see `Nodemap Trie` section)
189 189 #
190 190 # * a small "docket" file containing metadata
191 191 #
192 192 # While the nodemap data can be multiple tens of megabytes, the "docket" is
193 193 # small, it is easy to update it automatically or to duplicate its content
194 194 # during a transaction.
195 195 #
196 196 # Multiple raw data can exist at the same time (The currently valid one and a
197 197 # new one being used by an in progress transaction). To accommodate this, the
198 198 # filename hosting the raw data has a variable part. The exact filename is
199 199 # specified inside the "docket" file.
200 200 #
201 201 # The docket file contains information to find, qualify and validate the raw
202 202 # data. Its content is currently very light, but it will expand as the on disk
203 203 # nodemap gains the necessary features to be used in production.
204 204
# version 0 is experimental, no BC guarantee, do not use outside of tests.
ONDISK_VERSION = 0
# docket prefix: on-disk format version (one unsigned byte)
S_VERSION = struct.Struct(">B")
# docket header: uid size, tip rev, data length, unused data (big endian)
S_HEADER = struct.Struct(">BQQQ")

# number of random bytes in a rawdata file uid (hex encoded when stored)
ID_SIZE = 8
211 211
212 212
def _make_uid():
    """return a new unique identifier.

    The identifier is random and composed of ascii characters."""
    # hex-encoding doubles the length: the uid is 2 * ID_SIZE characters
    raw_id = os.urandom(ID_SIZE)
    return nodemod.hex(raw_id)
218 218
219 219
class NodeMapDocket(object):
    """metadata associated with persistent nodemap data

    The persistent data may come from disk or be on their way to disk.
    """

    def __init__(self, uid=None):
        if uid is None:
            uid = _make_uid()
        # identifier tying this docket to a single rawdata file
        self.uid = uid
        # highest revision covered by the persisted data
        self.tip_rev = None
        # number of valid bytes in the rawdata file
        self.data_length = None
        # number of bytes in the rawdata file no longer referenced
        self.data_unused = 0

    def copy(self):
        duplicate = NodeMapDocket(uid=self.uid)
        duplicate.tip_rev = self.tip_rev
        duplicate.data_length = self.data_length
        duplicate.data_unused = self.data_unused
        return duplicate

    def __cmp__(self, other):
        # python2-style three-way comparison on (uid, data_length)
        if self.uid < other.uid:
            return -1
        if self.uid > other.uid:
            return 1
        elif self.data_length < other.data_length:
            return -1
        elif self.data_length > other.data_length:
            return 1
        return 0

    def __eq__(self, other):
        return self.uid == other.uid and self.data_length == other.data_length

    def serialize(self):
        """return serialized bytes for a docket using the passed uid"""
        headers = (
            len(self.uid),
            self.tip_rev,
            self.data_length,
            self.data_unused,
        )
        parts = [
            S_VERSION.pack(ONDISK_VERSION),
            S_HEADER.pack(*headers),
            self.uid,
        ]
        return b''.join(parts)
268 268
269 269
270 270 def _rawdata_filepath(revlog, docket):
271 271 """The (vfs relative) nodemap's rawdata file for a given uid"""
272 272 prefix = revlog.nodemap_file[:-2]
273 273 return b"%s-%s.nd" % (prefix, docket.uid)
274 274
275 275
def _other_rawdata_filepath(revlog, docket):
    """return rawdata file paths that are not the one used by `docket`

    Such files are stale leftovers from older processes and may be removed.
    """
    prefix = revlog.nodemap_file[:-2]
    # the br"" prefix is required so `\.` survives as a regex escape for a
    # literal dot (a plain bytes literal would emit a DeprecationWarning and
    # rely on `\.` passing through unchanged)
    pattern = re.compile(br"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
    new_file_path = _rawdata_filepath(revlog, docket)
    new_file_name = revlog.opener.basename(new_file_path)
    dirpath = revlog.opener.dirname(new_file_path)
    others = []
    for f in revlog.opener.listdir(dirpath):
        if pattern.match(f) and f != new_file_name:
            others.append(f)
    return others
287 287
288 288
289 289 ### Nodemap Trie
290 290 #
291 291 # This is a simple reference implementation to compute and persist a nodemap
292 292 # trie. This reference implementation is write only. The python version of this
293 293 # is not expected to be actually used, since it won't provide performance
294 294 # improvement over existing non-persistent C implementation.
295 295 #
296 296 # The nodemap is persisted as Trie using 4bits-address/16-entries block. Each
297 297 # revision can be addressed using its node shortest prefix.
298 298 #
299 299 # The trie is stored as a sequence of blocks. Each block contains 16 entries
300 300 # (signed 32bit integer, big endian; see S_BLOCK). Each entry can be one of the following:
301 301 #
302 302 # * value >= 0 -> index of sub-block
303 303 # * value == -1 -> no value
304 304 # * value < -1 -> a revision value: rev = -(value+2)
305 305 #
306 306 # The implementation focus on simplicity, not on performance. A Rust
307 307 # implementation should provide an efficient version of the same binary
308 308 # persistence. This reference python implementation is never meant to be
309 309 # extensively used in production.
310 310
311 311
def persistent_data(index):
    """return the persistent binary form for a nodemap for a given index
    """
    return _persist_trie(_build_trie(index))
317 317
318 318
def update_persistent_data(index, root, max_idx, last_rev):
    """return the incremental update for persistent nodemap from a given index
    """
    changed_block, trie = _update_trie(index, root, last_rev)
    # report the amount of now-dead bytes alongside the fresh serialization
    data = _persist_trie(trie, existing_idx=max_idx)
    return (changed_block * S_BLOCK.size, data)
327 327
328 328
# one trie block: 16 signed 32-bit big-endian integers ("l" is 4 bytes in
# standard size mode)
S_BLOCK = struct.Struct(">" + ("l" * 16))

# sentinel entry value meaning "no entry"
NO_ENTRY = -1
# rev 0 need to be -2 because 0 is used by block, -1 is a special value.
REV_OFFSET = 2
334 334
335 335
def _transform_rev(rev):
    """Return the number used to represent the rev in the tree.

    (or retrieve a rev number from such representation)

    Note that this is an involution, a function equal to its inverse (i.e.
    which gives the identity when applied to itself).
    """
    # shift by REV_OFFSET so rev 0 maps to -2, clear of block indexes (>= 0)
    # and of the NO_ENTRY sentinel (-1)
    return -rev - REV_OFFSET
345 345
346 346
def _to_int(hex_digit):
    """turn an hexadecimal digit into a proper integer"""
    # accepts a one-character bytes (or str) digit, e.g. b"f" -> 15
    return int(hex_digit, 16)
350 350
351 351
class Block(dict):
    """represent a block of the Trie

    contains up to 16 entry indexed from 0 to 15"""

    def __init__(self):
        super(Block, self).__init__()
        # If this block exist on disk, here is its ID
        self.ondisk_id = None

    def __iter__(self):
        # always yield exactly 16 slots; missing entries come out as None
        for slot in range(16):
            yield self.get(slot)
364 364
365 365
def _build_trie(index):
    """build a nodemap trie

    The nodemap stores revision number for each unique prefix.

    Each block is a dictionary with keys in `[0, 15]`. Values are either
    another block or a revision number.
    """
    root = Block()
    for rev in range(len(index)):
        # index entries store the binary node at position 7
        node_hex = nodemod.hex(index[rev][7])
        _insert_into_block(index, 0, root, rev, node_hex)
    return root
379 379
380 380
def _update_trie(index, root, last_rev):
    """add the revisions after `last_rev` to the `root` trie, in place

    Returns a `(changed, root)` pair, `changed` being the number of blocks
    touched by the insertions.
    """
    changed = 0
    for rev in range(last_rev + 1, len(index)):
        node_hex = nodemod.hex(index[rev][7])
        changed += _insert_into_block(index, 0, root, rev, node_hex)
    return changed, root
388 388
389 389
def _insert_into_block(index, level, block, current_rev, current_hex):
    """insert a new revision in a block

    index: the index we are adding revision for
    level: the depth of the current block in the trie
    block: the block currently being considered
    current_rev: the revision number we are adding
    current_hex: the hexadecimal representation of that revision

    Returns the number of blocks whose content changed.
    """
    changed = 1
    if block.ondisk_id is not None:
        # this block now diverges from its on-disk version
        block.ondisk_id = None
    hex_digit = _to_int(current_hex[level : level + 1])
    entry = block.get(hex_digit)
    if entry is None:
        # no entry, simply store the revision number
        block[hex_digit] = current_rev
        return changed
    if isinstance(entry, dict):
        # need to recurse to an underlying block
        changed += _insert_into_block(
            index, level + 1, entry, current_rev, current_hex
        )
        return changed
    # collision with a previously unique prefix: insert a new sub-block
    # deep enough to tell both entries apart
    other_rev = entry
    other_hex = nodemod.hex(index[other_rev][7])
    sub = Block()
    block[hex_digit] = sub
    _insert_into_block(index, level + 1, sub, other_rev, other_hex)
    _insert_into_block(index, level + 1, sub, current_rev, current_hex)
    return changed
422 422
423 423
def _persist_trie(root, existing_idx=None):
    """turn a nodemap trie into persistent binary data

    See `_build_trie` for nodemap trie structure"""
    block_map = {}
    # new blocks are numbered after any previously persisted ones
    base_idx = 0 if existing_idx is None else existing_idx + 1
    chunks = []
    for tn in _walk_trie(root):
        if tn.ondisk_id is not None:
            # unchanged block: reuse its existing on-disk index
            block_map[id(tn)] = tn.ondisk_id
        else:
            block_map[id(tn)] = len(chunks) + base_idx
            chunks.append(_persist_block(tn, block_map))
    return b''.join(chunks)
441 441
442 442
443 443 def _walk_trie(block):
444 444 """yield all the block in a trie
445 445
446 446 Children blocks are always yield before their parent block.
447 447 """
448 448 for (_, item) in sorted(block.items()):
449 449 if isinstance(item, dict):
450 450 for sub_block in _walk_trie(item):
451 451 yield sub_block
452 452 yield block
453 453
454 454
def _persist_block(block_node, block_map):
    """produce persistent binary data for a single block

    Children block are assumed to be already persisted and present in
    block_map.
    """
    values = [_to_value(entry, block_map) for entry in block_node]
    return S_BLOCK.pack(*values)
463 463
464 464
def _to_value(item, block_map):
    """persist any value as an integer"""
    if item is None:
        return NO_ENTRY
    if isinstance(item, dict):
        # sub-blocks are stored as their (non-negative) on-disk index
        return block_map[id(item)]
    # revisions are stored shifted into the negative range
    return _transform_rev(item)
473 473
474 474
def parse_data(data):
    """parse nodemap data into a nodemap Trie

    Returns a `(root, last_idx)` pair, `last_idx` being the index of the last
    block read (`None` when `data` is empty).
    """
    if (len(data) % S_BLOCK.size) != 0:
        # byte-prefixed for consistency with every other message in this
        # module (error.Abort messages are bytes)
        msg = b"nodemap data size is not a multiple of block size (%d): %d"
        raise error.Abort(msg % (S_BLOCK.size, len(data)))
    if not data:
        return Block(), None
    block_map = {}
    new_blocks = []
    # first pass: materialize every block so forward references resolve
    for i in range(0, len(data), S_BLOCK.size):
        block = Block()
        block.ondisk_id = len(block_map)
        block_map[block.ondisk_id] = block
        block_data = data[i : i + S_BLOCK.size]
        values = S_BLOCK.unpack(block_data)
        new_blocks.append((block, values))
    # second pass: decode entries, linking sub-blocks and revisions
    for b, values in new_blocks:
        for idx, v in enumerate(values):
            if v == NO_ENTRY:
                continue
            elif v >= 0:
                # non-negative entries reference another block
                b[idx] = block_map[v]
            else:
                # negative entries encode a revision number
                b[idx] = _transform_rev(v)
    # children are serialized before parents, so the last block is the root
    return block, i // S_BLOCK.size
500 500
501 501
502 502 # debug utility
503 503
504 504
def check_data(ui, index, data):
    """verify that the provided nodemap data are valid for the given index"""
    ret = 0
    ui.status((b"revision in index: %d\n") % len(index))
    root, __ = parse_data(data)
    all_revs = set(_all_revisions(root))
    ui.status((b"revision in nodemap: %d\n") % len(all_revs))
    for r in range(len(index)):
        if r not in all_revs:
            ui.write_err(b" revision missing from nodemap: %d\n" % r)
            ret = 1
            continue
        all_revs.remove(r)
        # the node for `r` must lead back to `r` through the trie
        nm_rev = _find_node(root, nodemod.hex(index[r][7]))
        if nm_rev is None:
            ui.write_err(b" revision node does not match any entries: %d\n" % r)
            ret = 1
        elif nm_rev != r:
            msg = (
                b" revision node does not match the expected revision: "
                b"%d != %d\n" % (r, nm_rev)
            )
            ui.write_err(msg)
            ret = 1

    # anything left over is in the nodemap but not in the index
    for r in sorted(all_revs):
        ui.write_err(b" extra revision in nodemap: %d\n" % r)
        ret = 1
    return ret
538 538
539 539
def _all_revisions(root):
    """return all revisions stored in a Trie"""
    for block in _walk_trie(root):
        for entry in block:
            # skip empty slots and references to sub-blocks
            if entry is not None and not isinstance(entry, Block):
                yield entry
547 547
548 548
def _find_node(block, node):
    """find the revision associated with a given node"""
    # walk one hex digit at a time, iteratively instead of recursively
    entry = block.get(_to_int(node[0:1]))
    while isinstance(entry, dict):
        node = node[1:]
        entry = entry.get(_to_int(node[0:1]))
    return entry
General Comments 0
You need to be logged in to leave comments. Login now