##// END OF EJS Templates
nodemap: fix some comment formatting...
marmoute -
r48244:f70ca39d default
parent child Browse files
Show More
@@ -1,647 +1,647 b''
1 1 # nodemap.py - nodemap related code and utilities
2 2 #
3 3 # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
4 4 # Copyright 2019 George Racinet <georges.racinet@octobus.net>
5 5 #
6 6 # This software may be used and distributed according to the terms of the
7 7 # GNU General Public License version 2 or any later version.
8 8
9 9 from __future__ import absolute_import
10 10
11 11 import errno
12 12 import re
13 13 import struct
14 14
15 15 from ..node import hex
16 16
17 17 from .. import (
18 18 error,
19 19 util,
20 20 )
21 21 from . import docket as docket_mod
22 22
23 23
class NodeMap(dict):
    """Mapping from node to revision number, erroring out on unknown nodes."""

    def __missing__(self, node):
        raise error.RevlogError(b'unknown node: %s' % node)
28 28
def persisted_data(revlog):
    """read the nodemap for a revlog from disk

    Returns a ``(docket, data)`` pair, or ``None`` whenever the on-disk
    nodemap is missing, outdated, truncated or otherwise unusable.
    """
    if revlog._nodemap_file is None:
        return None  # persistent nodemap is not enabled for this revlog
    pdata = revlog.opener.tryread(revlog._nodemap_file)
    if not pdata:
        return None  # no docket on disk (or an empty one)
    offset = 0
    # the docket starts with a single-byte format version
    (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
    if version != ONDISK_VERSION:
        return None  # written by an incompatible version, ignore it
    offset += S_VERSION.size
    # fixed-size header: uid size, tip rev, data length, unused byte count
    # and tip node size (see S_HEADER)
    headers = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
    uid_size, tip_rev, data_length, data_unused, tip_node_size = headers
    offset += S_HEADER.size
    # the variable-size uid and tip node follow the header
    docket = NodeMapDocket(pdata[offset : offset + uid_size])
    offset += uid_size
    docket.tip_rev = tip_rev
    docket.tip_node = pdata[offset : offset + tip_node_size]
    docket.data_length = data_length
    docket.data_unused = data_unused

    # load the raw trie data the docket points to, mmap-ing it if configured
    filename = _rawdata_filepath(revlog, docket)
    use_mmap = revlog.opener.options.get(b"persistent-nodemap.mmap")
    try:
        with revlog.opener(filename) as fd:
            if use_mmap:
                try:
                    data = util.buffer(util.mmapread(fd, data_length))
                except ValueError:
                    # raised when the read file is too small
                    data = b''
            else:
                data = fd.read(data_length)
    except (IOError, OSError) as e:
        if e.errno == errno.ENOENT:
            return None  # the data file vanished, the docket is stale
        else:
            raise
    if len(data) < data_length:
        return None  # truncated data file, do not trust it
    return docket, data
71 71
72 72
def setup_persistent_nodemap(tr, revlog):
    """Install whatever is needed transaction side to persist a nodemap on disk

    (only actually persist the nodemap if this is relevant for this revlog)
    """
    if revlog._inline:
        # inlined revlogs are too small for persistence to be relevant
        return
    if revlog._nodemap_file is None:
        # this revlog does not use a persistent nodemap
        return

    # the callback must run after the changelog finalization; the "nm-" id
    # sorts after the changelog's "cl-" callbacks
    callback_id = b"nm-revlog-persistent-nodemap-%s" % revlog._nodemap_file
    if tr.hasfinalize(callback_id):
        # already registered on this transaction
        return
    write_pending = lambda tr: persist_nodemap(tr, revlog, pending=True)
    write_final = lambda tr: persist_nodemap(tr, revlog)
    tr.addpending(callback_id, write_pending)
    tr.addfinalize(callback_id, write_final)
91 91
92 92
93 93 class _NoTransaction(object):
94 94 """transaction like object to update the nodemap outside a transaction"""
95 95
96 96 def __init__(self):
97 97 self._postclose = {}
98 98
99 99 def addpostclose(self, callback_id, callback_func):
100 100 self._postclose[callback_id] = callback_func
101 101
102 102 def registertmp(self, *args, **kwargs):
103 103 pass
104 104
105 105 def addbackup(self, *args, **kwargs):
106 106 pass
107 107
108 108 def add(self, *args, **kwargs):
109 109 pass
110 110
111 111 def addabort(self, *args, **kwargs):
112 112 pass
113 113
114 114 def _report(self, *args):
115 115 pass
116 116
117 117
def update_persistent_nodemap(revlog):
    """update the persistent nodemap right now

    To be used for updating the nodemap on disk outside of a normal transaction
    setup (eg, `debugupdatecache`).
    """
    # same relevance checks as in setup_persistent_nodemap
    if revlog._inline or revlog._nodemap_file is None:
        return

    fake_tr = _NoTransaction()
    persist_nodemap(fake_tr, revlog)
    # run the cleanup callbacks a real transaction would have triggered
    for callback_id in sorted(fake_tr._postclose):
        callback = fake_tr._postclose[callback_id]
        callback(None)
133 133
134 134
135 135 def delete_nodemap(tr, repo, revlog):
136 """ Delete nodemap data on disk for a given revlog"""
136 """Delete nodemap data on disk for a given revlog"""
137 137 if revlog._nodemap_file is None:
138 138 msg = "calling persist nodemap on a revlog without the feature enabled"
139 139 raise error.ProgrammingError(msg)
140 140 repo.svfs.unlink(revlog._nodemap_file)
141 141
142 142
def persist_nodemap(tr, revlog, pending=False, force=False):
    """Write nodemap data on disk for a given revlog

    tr: transaction to register file additions/backups/cleanups with
    revlog: the revlog whose nodemap must be persisted
    pending: when True, write the docket to a ``.a`` (pending) file and
        register it as temporary instead of backing up the final file
    force: when True, enable the nodemap file on a revlog that did not have
        the feature configured yet

    Raises ProgrammingError on a filtered changelog or, unless ``force`` is
    set, on a revlog without the persistent nodemap enabled.
    """
    if getattr(revlog, 'filteredrevs', ()):
        raise error.ProgrammingError(
            "cannot persist nodemap of a filtered changelog"
        )
    if revlog._nodemap_file is None:
        if force:
            revlog._nodemap_file = get_nodemap_file(revlog)
        else:
            msg = "calling persist nodemap on a revlog without the feature enabled"
            raise error.ProgrammingError(msg)

    can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental")
    ondisk_docket = revlog._nodemap_docket
    # can the index consume the data we are about to write back?
    feed_data = util.safehasattr(revlog.index, "update_nodemap_data")
    use_mmap = revlog.opener.options.get(b"persistent-nodemap.mmap")

    data = None
    # first attempt an incremental update of the data
    if can_incremental and ondisk_docket is not None:
        target_docket = revlog._nodemap_docket.copy()
        (
            src_docket,
            data_changed_count,
            data,
        ) = revlog.index.nodemap_data_incremental()
        new_length = target_docket.data_length + len(data)
        new_unused = target_docket.data_unused + data_changed_count
        if src_docket != target_docket:
            # in-memory data are not based on the on-disk file: full rewrite
            data = None
        elif new_length <= (new_unused * 10):  # under 10% of unused data
            # too much dead data accumulated, compact with a full rewrite
            data = None
        else:
            datafile = _rawdata_filepath(revlog, target_docket)
            # EXP-TODO: if this is a cache, this should use a cache vfs, not a
            # store vfs
            tr.add(datafile, target_docket.data_length)
            with revlog.opener(datafile, b'r+') as fd:
                fd.seek(target_docket.data_length)
                fd.write(data)
                if feed_data:
                    # keep this consistent with the reading side in
                    # `persisted_data`: mmap the file back when mmap is
                    # requested, otherwise read plain bytes (the two branch
                    # bodies were previously swapped)
                    if use_mmap:
                        fd.flush()
                        new_data = util.buffer(util.mmapread(fd, new_length))
                    else:
                        fd.seek(0)
                        new_data = fd.read(new_length)
            target_docket.data_length = new_length
            target_docket.data_unused = new_unused

    if data is None:
        # otherwise fallback to a full new export
        target_docket = NodeMapDocket()
        datafile = _rawdata_filepath(revlog, target_docket)
        if util.safehasattr(revlog.index, "nodemap_data_all"):
            data = revlog.index.nodemap_data_all()
        else:
            data = persistent_data(revlog.index)
        # EXP-TODO: if this is a cache, this should use a cache vfs, not a
        # store vfs

        tryunlink = revlog.opener.tryunlink

        def abortck(tr):
            tryunlink(datafile)

        callback_id = b"delete-%s" % datafile

        # some flavor of the transaction abort does not cleanup new file, it
        # simply empty them.
        tr.addabort(callback_id, abortck)
        with revlog.opener(datafile, b'w+') as fd:
            fd.write(data)
            if feed_data:
                # mmap the freshly written file when mmap is requested; the
                # plain bytes are already at hand in `data` otherwise (these
                # two branch bodies were previously swapped)
                if use_mmap:
                    fd.flush()
                    new_data = util.buffer(util.mmapread(fd, len(data)))
                else:
                    new_data = data
        target_docket.data_length = len(data)
    target_docket.tip_rev = revlog.tiprev()
    target_docket.tip_node = revlog.node(target_docket.tip_rev)
    # EXP-TODO: if this is a cache, this should use a cache vfs, not a
    # store vfs
    file_path = revlog._nodemap_file
    if pending:
        file_path += b'.a'
        tr.registertmp(file_path)
    else:
        tr.addbackup(file_path)

    with revlog.opener(file_path, b'w', atomictemp=True) as fp:
        fp.write(target_docket.serialize())
    revlog._nodemap_docket = target_docket
    if feed_data:
        revlog.index.update_nodemap_data(target_docket, new_data)

    # search for old index file in all cases, some older process might have
    # left one behind.
    olds = _other_rawdata_filepath(revlog, target_docket)
    if olds:
        realvfs = getattr(revlog, '_realopener', revlog.opener)

        def cleanup(tr):
            for oldfile in olds:
                realvfs.tryunlink(oldfile)

        callback_id = b"revlog-cleanup-nodemap-%s" % revlog._nodemap_file
        tr.addpostclose(callback_id, cleanup)
253 253
254 254
### Nodemap docket file
#
# The nodemap data are stored on disk using 2 files:
#
# * a raw data file containing a persistent nodemap
#   (see the `Nodemap Trie` section)
#
# * a small "docket" file containing metadata
#
# While the nodemap data can be multiple tens of megabytes, the "docket" is
# small, so it is easy to update it automatically or to duplicate its content
# during a transaction.
#
# Multiple raw data files can exist at the same time (the currently valid one
# and a new one being used by an in-progress transaction). To accommodate
# this, the filename hosting the raw data has a variable part. The exact
# filename is specified inside the "docket" file.
#
# The docket file contains information to find, qualify and validate the raw
# data. Its content is currently very light, but it will expand as the on-disk
# nodemap gains the necessary features to be used in production.
276 276
# version of the on-disk docket format; readers ignore other versions
ONDISK_VERSION = 1
# single leading byte of the docket: the format version
S_VERSION = struct.Struct(">B")
# docket header, big endian: uid size (1 byte), then tip rev, data length,
# unused-data length and tip node size as unsigned 64-bit integers
S_HEADER = struct.Struct(">BQQQQ")
280 280
281 281
class NodeMapDocket(object):
    """metadata associated with persistent nodemap data

    The persistent data may come from disk or be on their way to disk.
    """

    def __init__(self, uid=None):
        if uid is None:
            uid = docket_mod.make_uid()
        # unique identifier of the data file:
        # - preserved when new data are appended,
        # - renewed whenever a fresh data file is created.
        self.uid = uid
        # tipmost revision covered by the data file; every revision before
        # it is expected to be encoded there as well
        self.tip_rev = None
        # node of that tipmost revision; if it does not match the current
        # index, the docket is invalid for that index and must be discarded.
        #
        # note: this check is not airtight, as destructive operations could
        # keep the same tip_rev + tip_node while altering lower revisions.
        # However multiple other caches share the same weakness (eg:
        # branchmap cache).
        self.tip_node = None
        # number of meaningful bytes in the data file for `tip_rev`:
        # - shorter data files are corrupted,
        # - any trailing extra bytes must be ignored.
        self.data_length = None
        # amount (in bytes) of "dead" data: still present in the data file
        # but no longer used by the nodemap
        self.data_unused = 0

    def copy(self):
        other = NodeMapDocket(uid=self.uid)
        other.tip_rev = self.tip_rev
        other.tip_node = self.tip_node
        other.data_length = self.data_length
        other.data_unused = self.data_unused
        return other

    def __cmp__(self, other):
        # order by (uid, data_length); only consulted on Python 2
        if self.uid < other.uid:
            return -1
        if self.uid > other.uid:
            return 1
        if self.data_length < other.data_length:
            return -1
        if self.data_length > other.data_length:
            return 1
        return 0

    def __eq__(self, other):
        same_uid = self.uid == other.uid
        same_length = self.data_length == other.data_length
        return same_uid and same_length

    def serialize(self):
        """return serialized bytes for a docket using the passed uid"""
        header = S_HEADER.pack(
            len(self.uid),
            self.tip_rev,
            self.data_length,
            self.data_unused,
            len(self.tip_node),
        )
        parts = [
            S_VERSION.pack(ONDISK_VERSION),
            header,
            self.uid,
            self.tip_node,
        ]
        return b''.join(parts)
353 353
354 354
355 355 def _rawdata_filepath(revlog, docket):
356 356 """The (vfs relative) nodemap's rawdata file for a given uid"""
357 357 prefix = revlog.radix
358 358 return b"%s-%s.nd" % (prefix, docket.uid)
359 359
360 360
def _other_rawdata_filepath(revlog, docket):
    """list rawdata files of this revlog that belong to other uids

    Used to locate stale data files left behind by earlier transactions or
    older processes.
    """
    prefix = revlog.radix
    pattern = re.compile(br"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
    current_path = _rawdata_filepath(revlog, docket)
    current_name = revlog.opener.basename(current_path)
    dirpath = revlog.opener.dirname(current_path)
    return [
        f
        for f in revlog.opener.listdir(dirpath)
        if f != current_name and pattern.match(f)
    ]
372 372
373 373
### Nodemap Trie
#
# This is a simple reference implementation to compute and persist a nodemap
# trie. This reference implementation is write only. The Python version of
# this is not expected to be actually used, since it won't provide any
# performance improvement over the existing non-persistent C implementation.
#
# The nodemap is persisted as a Trie using 4-bit addresses and 16-entry
# blocks. Each revision can be addressed using its node's shortest unique
# prefix.
#
# The trie is stored as a sequence of blocks. Each block contains 16 entries
# (big-endian signed 32-bit integers, see S_BLOCK). Each entry can be one of
# the following:
#
# * value >= 0 -> index of sub-block
# * value == -1 -> no value
# * value < -1 -> encoded revision: rev = -(value+2)
#
# See REV_OFFSET and _transform_rev below.
#
# The implementation focuses on simplicity, not on performance. A Rust
# implementation should provide an efficient version of the same binary
# persistence. This reference Python implementation is never meant to be
# extensively used in production.
397 397
398 398
def persistent_data(index):
    """return the persistent binary form for a nodemap for a given index"""
    return _persist_trie(_build_trie(index))
403 403
404 404
def update_persistent_data(index, root, max_idx, last_rev):
    """return the incremental update for persistent nodemap from a given index"""
    changed_block, trie = _update_trie(index, root, last_rev)
    new_data = _persist_trie(trie, existing_idx=max_idx)
    return (changed_block * S_BLOCK.size, new_data)
412 412
413 413
414 414 S_BLOCK = struct.Struct(">" + ("l" * 16))
415 415
416 416 NO_ENTRY = -1
417 417 # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
418 418 REV_OFFSET = 2
419 419
420 420
421 421 def _transform_rev(rev):
422 422 """Return the number used to represent the rev in the tree.
423 423
424 424 (or retrieve a rev number from such representation)
425 425
426 426 Note that this is an involution, a function equal to its inverse (i.e.
427 427 which gives the identity when applied to itself).
428 428 """
429 429 return -(rev + REV_OFFSET)
430 430
431 431
432 432 def _to_int(hex_digit):
433 433 """turn an hexadecimal digit into a proper integer"""
434 434 return int(hex_digit, 16)
435 435
436 436
class Block(dict):
    """represent a block of the Trie

    Holds up to 16 entries indexed from 0 to 15. Iterating a block yields
    all 16 slots in order, with ``None`` standing for empty slots.
    """

    def __init__(self):
        super(Block, self).__init__()
        # if this block exists on disk, the id it was stored under
        self.ondisk_id = None

    def __iter__(self):
        return iter(self.get(slot) for slot in range(16))
449 449
450 450
def _build_trie(index):
    """build a nodemap trie

    The nodemap stores revision number for each unique prefix.

    Each block is a dictionary with keys in `[0, 15]`. Values are either
    another block or a revision number.
    """
    root = Block()
    total = len(index)
    for rev in range(total):
        hex_node = hex(index[rev][7])
        _insert_into_block(index, 0, root, rev, hex_node)
    return root
464 464
465 465
def _update_trie(index, root, last_rev):
    """insert the revisions newer than ``last_rev`` into an existing trie

    Returns ``(changed, root)`` where ``changed`` is the number of block
    modifications reported by the insertions (used for dead-data accounting).
    """
    changed = 0
    for rev in range(last_rev + 1, len(index)):
        hex_node = hex(index[rev][7])
        changed += _insert_into_block(index, 0, root, rev, hex_node)
    return changed, root
473 473
474 474
def _insert_into_block(index, level, block, current_rev, current_hex):
    """insert a new revision in a block

    index: the index we are adding revision for
    level: the depth of the current block in the trie
    block: the block currently being considered
    current_rev: the revision number we are adding
    current_hex: the hexadecimal representation of the node of that revision

    Returns the number of existing blocks modified along the way; blocks
    freshly created to resolve a collision are not counted (their recursive
    return values are discarded below).
    """
    changed = 1
    # this block is about to be mutated, so any persisted copy is now stale
    if block.ondisk_id is not None:
        block.ondisk_id = None
    # one hex digit of the node addresses one level of the trie
    hex_digit = _to_int(current_hex[level : level + 1])
    entry = block.get(hex_digit)
    if entry is None:
        # no entry, simply store the revision number
        block[hex_digit] = current_rev
    elif isinstance(entry, dict):
        # need to recurse to an underlying block
        changed += _insert_into_block(
            index, level + 1, entry, current_rev, current_hex
        )
    else:
        # collision with a previously unique prefix, inserting new
        # vertices to fit both entry.
        other_hex = hex(index[entry][7])
        other_rev = entry
        new = Block()
        block[hex_digit] = new
        # re-insert the displaced revision first, then the new one; both may
        # recurse further if their prefixes keep colliding
        _insert_into_block(index, level + 1, new, other_rev, other_hex)
        _insert_into_block(index, level + 1, new, current_rev, current_hex)
    return changed
507 507
508 508
def _persist_trie(root, existing_idx=None):
    """turn a nodemap trie into persistent binary data

    See `_build_trie` for nodemap trie structure

    Blocks already on disk keep their existing id; only new blocks are
    serialized, numbered from ``existing_idx + 1`` onward.
    """
    base_idx = 0 if existing_idx is None else existing_idx + 1
    block_map = {}
    chunks = []
    for node in _walk_trie(root):
        if node.ondisk_id is None:
            # fresh block: assign the next free on-disk index, then persist
            block_map[id(node)] = base_idx + len(chunks)
            chunks.append(_persist_block(node, block_map))
        else:
            # already on disk, simply reuse its id
            block_map[id(node)] = node.ondisk_id
    return b''.join(chunks)
526 526
527 527
528 528 def _walk_trie(block):
529 529 """yield all the block in a trie
530 530
531 531 Children blocks are always yield before their parent block.
532 532 """
533 533 for (__, item) in sorted(block.items()):
534 534 if isinstance(item, dict):
535 535 for sub_block in _walk_trie(item):
536 536 yield sub_block
537 537 yield block
538 538
539 539
def _persist_block(block_node, block_map):
    """produce persistent binary data for a single block

    Children block are assumed to be already persisted and present in
    block_map.
    """
    slots = [_to_value(entry, block_map) for entry in block_node]
    return S_BLOCK.pack(*slots)
548 548
549 549
def _to_value(item, block_map):
    """persist any value as an integer"""
    if item is None:
        # empty slot
        return NO_ENTRY
    if isinstance(item, dict):
        # child block: store its on-disk index
        return block_map[id(item)]
    # revision number: store it in its encoded (negative) form
    return _transform_rev(item)
558 558
559 559
def parse_data(data):
    """parse nodemap data into a nodemap Trie

    Returns a ``(root, last_index)`` pair. Since blocks are persisted
    children-first (see _walk_trie), the root is the last block of the data;
    ``last_index`` is its block index, or None when the data are empty.
    """
    if (len(data) % S_BLOCK.size) != 0:
        msg = b"nodemap data size is not a multiple of block size (%d): %d"
        raise error.Abort(msg % (S_BLOCK.size, len(data)))
    if not data:
        return Block(), None
    block_map = {}
    new_blocks = []
    # first pass: materialize every block and unpack its raw slot values
    for i in range(0, len(data), S_BLOCK.size):
        block = Block()
        block.ondisk_id = len(block_map)
        block_map[block.ondisk_id] = block
        block_data = data[i : i + S_BLOCK.size]
        values = S_BLOCK.unpack(block_data)
        new_blocks.append((block, values))
    # second pass: resolve raw slot values into child blocks or revisions
    for b, values in new_blocks:
        for idx, v in enumerate(values):
            if v == NO_ENTRY:
                continue
            elif v >= 0:
                # non-negative values index another block
                b[idx] = block_map[v]
            else:
                # remaining negative values encode a revision number
                b[idx] = _transform_rev(v)
    # `block` and `i` are left over from the first loop: the last block and
    # its byte offset, i.e. the root of the trie
    return block, i // S_BLOCK.size
585 585
586 586
587 587 # debug utility
588 588
589 589
def check_data(ui, index, data):
    """verify that the provided nodemap data are valid for the given index

    Returns 0 when index and nodemap fully agree, 1 otherwise; every
    discrepancy is reported on the ui error channel.
    """
    ret = 0
    ui.status((b"revision in index: %d\n") % len(index))
    root, __ = parse_data(data)
    all_revs = set(_all_revisions(root))
    ui.status((b"revision in nodemap: %d\n") % len(all_revs))
    # every revision of the index must appear in the nodemap, reachable
    # through its own node
    for r in range(len(index)):
        if r not in all_revs:
            msg = b" revision missing from nodemap: %d\n" % r
            ui.write_err(msg)
            ret = 1
        else:
            all_revs.remove(r)
            # look the revision's node up in the trie and make sure it maps
            # back to the same revision
            nm_rev = _find_node(root, hex(index[r][7]))
            if nm_rev is None:
                msg = b" revision node does not match any entries: %d\n" % r
                ui.write_err(msg)
                ret = 1
            elif nm_rev != r:
                msg = (
                    b" revision node does not match the expected revision: "
                    b"%d != %d\n" % (r, nm_rev)
                )
                ui.write_err(msg)
                ret = 1

    # whatever is left in all_revs has no counterpart in the index
    if all_revs:
        for r in sorted(all_revs):
            msg = b" extra revision in nodemap: %d\n" % r
            ui.write_err(msg)
            ret = 1
    return ret
623 623
624 624
def _all_revisions(root):
    """yield every revision number stored in a Trie"""
    for block in _walk_trie(root):
        for entry in block:
            # skip empty slots and references to child blocks
            is_rev = entry is not None and not isinstance(entry, Block)
            if is_rev:
                yield entry
632 632
633 633
def _find_node(block, node):
    """find the revision associated with a given node

    Returns None when no entry matches the node's prefix.
    """
    # walk the trie iteratively, one hex digit per level, until we hit a
    # leaf (a revision number) or an empty slot (None)
    current = block
    remaining = node
    while isinstance(current, dict):
        current = current.get(_to_int(remaining[0:1]))
        remaining = remaining[1:]
    return current
640 640
641 641
def get_nodemap_file(revlog):
    """return the vfs-relative path of the nodemap docket for this revlog

    When the revlog reads pending data, the pending docket (``.n.a``) is
    preferred if it exists.
    """
    base = revlog.radix
    if revlog._trypending:
        pending_path = base + b".n.a"
        if revlog.opener.exists(pending_path):
            return pending_path
    return base + b".n"
General Comments 0
You need to be logged in to leave comments. Login now