dirstate-v2: fix typos in docstrings
Raphaël Gomès
r50441:07b9ca52 stable
@@ -1,421 +1,421 @@
1 1 # v2.py - Pure-Python implementation of the dirstate-v2 file format
2 2 #
3 3 # Copyright Mercurial Contributors
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8
9 9 import struct
10 10
11 11 from ..thirdparty import attr
12 12 from .. import error, policy
13 13
14 14 parsers = policy.importmod('parsers')
15 15
16 16
17 17 # Must match the constant of the same name in
18 18 # `rust/hg-core/src/dirstate_tree/on_disk.rs`
19 19 TREE_METADATA_SIZE = 44
20 20 NODE_SIZE = 44
21 21
22 22
23 23 # Must match the `TreeMetadata` Rust struct in
24 24 # `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
25 25 #
26 26 # * 4 bytes: start offset of root nodes
27 27 # * 4 bytes: number of root nodes
28 28 # * 4 bytes: total number of nodes in the tree that have an entry
29 29 # * 4 bytes: total number of nodes in the tree that have a copy source
30 30 # * 4 bytes: number of bytes in the data file that are not used anymore
31 31 # * 4 bytes: unused
32 32 # * 20 bytes: SHA-1 hash of ignore patterns
33 33 TREE_METADATA = struct.Struct('>LLLLL4s20s')
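For orientation, a hedged sketch (not part of the module) of how the seven fields above unpack positionally; a zero-filled 44-byte blob yields all-zero counters, an empty unused field and an all-zero hash:

    TREE_METADATA.unpack(b'\x00' * TREE_METADATA.size)
    # -> (0, 0, 0, 0, 0, b'\x00\x00\x00\x00', b'\x00' * 20)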
34 34
35 35
36 36 # Must match the `Node` Rust struct in
37 37 # `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
38 38 #
39 39 # * 4 bytes: start offset of full path
40 40 # * 2 bytes: length of the full path
41 41 # * 2 bytes: length within the full path before its "base name"
42 42 # * 4 bytes: start offset of the copy source if any, or zero for no copy source
43 43 # * 2 bytes: length of the copy source if any, or unused
44 44 # * 4 bytes: start offset of child nodes
45 45 # * 4 bytes: number of child nodes
46 46 # * 4 bytes: number of descendant nodes that have an entry
47 47 # * 4 bytes: number of descendant nodes that have a "tracked" state
48 48 # * 2 bytes: flags
49 49 # * 4 bytes: expected size
50 50 # * 4 bytes: mtime seconds
51 51 # * 4 bytes: mtime nanoseconds
52 52 NODE = struct.Struct('>LHHLHLLLLHlll')
53 53
54 54
55 55 assert TREE_METADATA_SIZE == TREE_METADATA.size
56 56 assert NODE_SIZE == NODE.size
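As an additional cross-check of the NODE field list above (illustrative only, not part of the module), the thirteen per-field sizes sum to NODE_SIZE:

    assert sum((4, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 4, 4)) == NODE.size == 44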
57 57
58 58 # match constant in mercurial/pure/parsers.py
59 59 DIRSTATE_V2_DIRECTORY = 1 << 13
60 60
61 61
62 62 def parse_dirstate(map, copy_map, data, tree_metadata):
63 """parse a full v2-dirstate from a binary data into dictionnaries:
63 """parse a full v2-dirstate from a binary data into dictionaries:
64 64
65 65 - map: a {path: entry} mapping that will be filled
66 66 - copy_map: a {path: copy-source} mapping that will be filled
67 67 - data: a binary blob containing v2 nodes data
68 68 - tree_metadata: a binary blob of the top level node (from the docket)
69 69 """
70 70 (
71 71 root_nodes_start,
72 72 root_nodes_len,
73 73 _nodes_with_entry_count,
74 74 _nodes_with_copy_source_count,
75 75 _unreachable_bytes,
76 76 _unused,
77 77 _ignore_patterns_hash,
78 78 ) = TREE_METADATA.unpack(tree_metadata)
79 79 parse_nodes(map, copy_map, data, root_nodes_start, root_nodes_len)
80 80
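A minimal usage sketch for parse_dirstate (the variable names data_blob and tree_metadata_blob are hypothetical; in practice they come from the dirstate data file and the docket). Both dictionaries are filled in place:

    entries, copies = {}, {}
    parse_dirstate(entries, copies, data_blob, tree_metadata_blob)
    # entries: {path: DirstateItem}, copies: {path: copy source}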
81 81
82 82 def parse_nodes(map, copy_map, data, start, len):
83 83 """parse <len> nodes from <data> starting at offset <start>
84 84
85 85 This is used by parse_dirstate to recursively fill `map` and `copy_map`.
86 86
87 87 All directory-specific information is ignored and does not need any
88 88 processing (DIRECTORY, ALL_UNKNOWN_RECORDED, ALL_IGNORED_RECORDED)
89 89 """
90 90 for i in range(len):
91 91 node_start = start + NODE_SIZE * i
92 92 node_bytes = slice_with_len(data, node_start, NODE_SIZE)
93 93 (
94 94 path_start,
95 95 path_len,
96 96 _basename_start,
97 97 copy_source_start,
98 98 copy_source_len,
99 99 children_start,
100 100 children_count,
101 101 _descendants_with_entry_count,
102 102 _tracked_descendants_count,
103 103 flags,
104 104 size,
105 105 mtime_s,
106 106 mtime_ns,
107 107 ) = NODE.unpack(node_bytes)
108 108
109 109 # Parse child nodes of this node recursively
110 110 parse_nodes(map, copy_map, data, children_start, children_count)
111 111
112 112 item = parsers.DirstateItem.from_v2_data(flags, size, mtime_s, mtime_ns)
113 113 if not item.any_tracked:
114 114 continue
115 115 path = slice_with_len(data, path_start, path_len)
116 116 map[path] = item
117 117 if copy_source_start:
118 118 copy_map[path] = slice_with_len(
119 119 data, copy_source_start, copy_source_len
120 120 )
121 121
122 122
123 123 def slice_with_len(data, start, len):
124 124 return data[start : start + len]
125 125
126 126
127 127 @attr.s
128 128 class Node:
129 129 path = attr.ib()
130 130 entry = attr.ib()
131 131 parent = attr.ib(default=None)
132 132 children_count = attr.ib(default=0)
133 133 children_offset = attr.ib(default=0)
134 134 descendants_with_entry = attr.ib(default=0)
135 135 tracked_descendants = attr.ib(default=0)
136 136
137 137 def pack(self, copy_map, paths_offset):
138 138 path = self.path
139 139 copy = copy_map.get(path)
140 140 entry = self.entry
141 141
142 142 path_start = paths_offset
143 143 path_len = len(path)
144 144 basename_start = path.rfind(b'/') + 1 # 0 if rfind returns -1
145 145 if copy is not None:
146 146 copy_source_start = paths_offset + len(path)
147 147 copy_source_len = len(copy)
148 148 else:
149 149 copy_source_start = 0
150 150 copy_source_len = 0
151 151 if entry is not None:
152 152 flags, size, mtime_s, mtime_ns = entry.v2_data()
153 153 else:
154 154 # There are no mtime-cached directories in the Python implementation
155 155 flags = DIRSTATE_V2_DIRECTORY
156 156 size = 0
157 157 mtime_s = 0
158 158 mtime_ns = 0
159 159 return NODE.pack(
160 160 path_start,
161 161 path_len,
162 162 basename_start,
163 163 copy_source_start,
164 164 copy_source_len,
165 165 self.children_offset,
166 166 self.children_count,
167 167 self.descendants_with_entry,
168 168 self.tracked_descendants,
169 169 flags,
170 170 size,
171 171 mtime_s,
172 172 mtime_ns,
173 173 )
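A small sketch of Node.pack for a directory node (entry is None, so the DIRSTATE_V2_DIRECTORY flag is used); the path b"some/dir" is purely illustrative, and the result is always exactly NODE_SIZE bytes:

    node = Node(b"some/dir", None)
    packed = node.pack(copy_map={}, paths_offset=0)
    assert len(packed) == NODE_SIZE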
174 174
175 175
176 176 def pack_dirstate(map, copy_map):
177 177 """
178 178 Pack `map` and `copy_map` into the dirstate v2 binary format and return
179 179 the bytearray.
180 180
181 181 The on-disk format expects a tree-like structure where the leaves are
182 182 written first (and sorted per-directory), going up levels until the root
183 183 node and writing that one to the docket. See more details on the on-disk
184 184 format in `mercurial/helptext/internals/dirstate-v2`.
185 185
186 186 Since both `map` and `copy_map` are flat dicts we need to figure out the
187 187 hierarchy. This algorithm does so without having to build the entire tree
188 188 in-memory: it only keeps the minimum number of nodes around to satisfy the
189 189 format.
190 190
191 191 # Algorithm explanation
192 192
193 193 This explanation does not talk about the different counters for tracked
194 descendents and storing the copies, but that work is pretty simple once this
194 descendants and storing the copies, but that work is pretty simple once this
195 195 algorithm is in place.
196 196
197 197 ## Building a subtree
198 198
199 199 First, sort `map`: this makes it so the leaves of the tree are contiguous
200 200 per directory (i.e. a/b/c and a/b/d will be next to each other in the list),
201 201 and enables us to use the ordering of folders to have a "cursor" of the
202 202 current folder we're in without ever going twice in the same branch of the
203 203 tree. The cursor is a node that remembers its parent and any information
204 204 relevant to the format (see the `Node` class), building the relevant part
205 205 of the tree lazily.
206 206 Then, for each file in `map`, move the cursor into the tree to the
207 207 corresponding folder of the file: for example, if the very first file
208 208 is "a/b/c", we start from `Node[""]`, create `Node["a"]` which points to
209 209 its parent `Node[""]`, then create `Node["a/b"]`, which points to its parent
210 210 `Node["a"]`. These nodes are kept around in a stack.
211 211 If the next file in `map` is in the same subtree ("a/b/d" or "a/b/e/f"), we
212 212 add it to the stack and keep looping with the same logic of creating the
213 213 tree nodes as needed. If however the next file in `map` is *not* in the same
214 214 subtree ("a/other", if we're still in the "a/b" folder), then we know that
215 215 the subtree we're in is complete.
216 216
217 217 ## Writing the subtree
218 218
219 219 We have the entire subtree in the stack, so we start writing it to disk
220 220 folder by folder. The way we write a folder is to pop the stack into a list
221 221 until the folder changes, reverse this list of direct children (to satisfy
222 222 the format requirement that children be sorted) and pack them. This process
223 223 repeats until we hit the "other" subtree.
224 224
225 225 An example:
226 226 a
227 227 dir1/b
228 228 dir1/c
229 229 dir2/dir3/d
230 230 dir2/dir3/e
231 231 dir2/f
232 232
233 233 Would have us (the packing decision here is sketched just after this docstring):
234 234 - add to the stack until "dir2/dir3/e"
235 235 - realize that "dir2/f" is in a different subtree
236 236 - pop "dir2/dir3/e", "dir2/dir3/d", reverse them so they're sorted and
237 237 pack them since the next entry is "dir2/dir3"
238 238 - go back up to "dir2"
239 239 - add "dir2/f" to the stack
240 240 - realize we're done with the map
241 241 - pop "dir2/f", "dir2/dir3" from the stack, reverse and pack them
242 242 - go up to the root node, do the same to write "a", "dir1" and "dir2" in
243 243 that order
244 244
245 245 ## Special case for the root node
246 246
247 247 The root node is not serialized in the format, but its information is
248 248 written to the docket. Again, see more details on the on-disk format in
249 249 `mercurial/helptext/internals/dirstate-v2`.
250 250 """
251 251 data = bytearray()
252 252 root_nodes_start = 0
253 253 root_nodes_len = 0
254 254 nodes_with_entry_count = 0
255 255 nodes_with_copy_source_count = 0
256 256 # Will always be 0 since this implementation always re-writes everything
257 257 # to disk
258 258 unreachable_bytes = 0
259 259 unused = b'\x00' * 4
260 260 # This is an optimization that's only useful for the Rust implementation
261 261 ignore_patterns_hash = b'\x00' * 20
262 262
263 263 if len(map) == 0:
264 264 tree_metadata = TREE_METADATA.pack(
265 265 root_nodes_start,
266 266 root_nodes_len,
267 267 nodes_with_entry_count,
268 268 nodes_with_copy_source_count,
269 269 unreachable_bytes,
270 270 unused,
271 271 ignore_patterns_hash,
272 272 )
273 273 return data, tree_metadata
274 274
275 275 sorted_map = sorted(map.items(), key=lambda x: x[0])
276 276
277 # Use a stack to not have to only remember the nodes we currently need
277 # Use a stack to have to only remember the nodes we currently need
278 278 # instead of building the entire tree in memory
279 279 stack = []
280 280 current_node = Node(b"", None)
281 281 stack.append(current_node)
282 282
283 283 for index, (path, entry) in enumerate(sorted_map, 1):
284 284 nodes_with_entry_count += 1
285 285 if path in copy_map:
286 286 nodes_with_copy_source_count += 1
287 287 current_folder = get_folder(path)
288 288 current_node = move_to_correct_node_in_tree(
289 289 current_folder, current_node, stack
290 290 )
291 291
292 292 current_node.children_count += 1
293 293 # Entries from `map` are never `None`
294 294 if entry.tracked:
295 295 current_node.tracked_descendants += 1
296 296 current_node.descendants_with_entry += 1
297 297 stack.append(Node(path, entry, current_node))
298 298
299 299 should_pack = True
300 300 next_path = None
301 301 if index < len(sorted_map):
302 302 # Determine if the next entry is in the same sub-tree; if so, don't
303 303 # pack yet
304 304 next_path = sorted_map[index][0]
305 305 should_pack = not is_ancestor(next_path, current_folder)
306 306 if should_pack:
307 307 pack_directory_children(current_node, copy_map, data, stack)
308 308 while stack and current_node.path != b"":
309 309 # Go up the tree and write until we reach the folder of the next
310 310 # entry (if any, otherwise the root)
311 311 parent = current_node.parent
312 312 in_ancestor_of_next_path = next_path is not None and (
313 313 is_ancestor(next_path, get_folder(stack[-1].path))
314 314 )
315 315 if parent is None or in_ancestor_of_next_path:
316 316 break
317 317 pack_directory_children(parent, copy_map, data, stack)
318 318 current_node = parent
319 319
320 320 # Special case for the root node since we don't write it to disk, only its
321 321 # children to the docket
322 322 current_node = stack.pop()
323 323 assert current_node.path == b"", current_node.path
324 324 assert len(stack) == 0, len(stack)
325 325
326 326 tree_metadata = TREE_METADATA.pack(
327 327 current_node.children_offset,
328 328 current_node.children_count,
329 329 nodes_with_entry_count,
330 330 nodes_with_copy_source_count,
331 331 unreachable_bytes,
332 332 unused,
333 333 ignore_patterns_hash,
334 334 )
335 335
336 336 return data, tree_metadata
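A minimal sanity sketch (not part of the module): packing an empty dirstate produces no node data and a 44-byte tree metadata blob destined for the docket:

    data, meta = pack_dirstate({}, {})
    assert len(data) == 0 and len(meta) == TREE_METADATA_SIZE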
337 337
338 338
339 339 def get_folder(path):
340 340 """
341 341 Return the folder of the path that's given, an empty string for root paths.
342 342 """
343 343 return path.rsplit(b'/', 1)[0] if b'/' in path else b''
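A couple of illustrative values for get_folder, in the same spirit as the doctests used for is_ancestor below:

    assert get_folder(b"a/b/c") == b"a/b"
    assert get_folder(b"top-level-file") == b""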
344 344
345 345
346 346 def is_ancestor(path, maybe_ancestor):
347 347 """Returns whether `maybe_ancestor` is an ancestor of `path`.
348 348
349 349 >>> is_ancestor(b"a", b"")
350 350 True
351 351 >>> is_ancestor(b"a/b/c", b"a/b/c")
352 352 False
353 353 >>> is_ancestor(b"hgext3rd/__init__.py", b"hgext")
354 354 False
355 355 >>> is_ancestor(b"hgext3rd/__init__.py", b"hgext3rd")
356 356 True
357 357 """
358 358 if maybe_ancestor == b"":
359 359 return True
360 360 if path <= maybe_ancestor:
361 361 return False
362 362 path_components = path.split(b"/")
363 363 ancestor_components = maybe_ancestor.split(b"/")
364 364 return all(c == o for c, o in zip(path_components, ancestor_components))
365 365
366 366
367 367 def move_to_correct_node_in_tree(target_folder, current_node, stack):
368 368 """
369 369 Move inside the dirstate node tree to the node corresponding to
370 370 `target_folder`, creating the missing nodes along the way if needed.
371 371 """
372 372 while target_folder != current_node.path:
373 373 if is_ancestor(target_folder, current_node.path):
374 374 # We need to go down a folder
375 375 prefix = target_folder[len(current_node.path) :].lstrip(b'/')
376 376 subfolder_name = prefix.split(b'/', 1)[0]
377 377 if current_node.path:
378 378 subfolder_path = current_node.path + b'/' + subfolder_name
379 379 else:
380 380 subfolder_path = subfolder_name
381 381 next_node = stack[-1]
382 382 if next_node.path == target_folder:
383 383 # This folder is now a file and only contains removed entries
384 384 # merge with the last node
385 385 current_node = next_node
386 386 else:
387 387 current_node.children_count += 1
388 388 current_node = Node(subfolder_path, None, current_node)
389 389 stack.append(current_node)
390 390 else:
391 391 # We need to go up a folder
392 392 current_node = current_node.parent
393 393 return current_node
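A hedged sketch of the descent described in the docstring above, starting from an empty root node; the target folder b"a/b" is hypothetical. Two intermediate nodes are created and pushed onto the stack:

    root = Node(b"", None)
    stack = [root]
    node = move_to_correct_node_in_tree(b"a/b", root, stack)
    assert node.path == b"a/b"
    assert [n.path for n in stack] == [b"", b"a", b"a/b"]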
394 394
395 395
396 396 def pack_directory_children(node, copy_map, data, stack):
397 397 """
398 398 Write the binary representation of the direct sorted children of `node` to
399 399 `data`
400 400 """
401 401 direct_children = []
402 402
403 403 while stack[-1].path != b"" and get_folder(stack[-1].path) == node.path:
404 404 direct_children.append(stack.pop())
405 405 if not direct_children:
406 406 raise error.ProgrammingError(b"no direct children for %r" % node.path)
407 407
408 408 # Reverse the popped children to get the correct sorted order
409 409 direct_children.reverse()
410 410 packed_children = bytearray()
411 411 # Write the paths to `data`. Pack child nodes but don't write them yet
412 412 for child in direct_children:
413 413 packed = child.pack(copy_map=copy_map, paths_offset=len(data))
414 414 packed_children.extend(packed)
415 415 data.extend(child.path)
416 416 data.extend(copy_map.get(child.path, b""))
417 417 node.tracked_descendants += child.tracked_descendants
418 418 node.descendants_with_entry += child.descendants_with_entry
419 419 # Write the fixed-size child nodes all together
420 420 node.children_offset = len(data)
421 421 data.extend(packed_children)
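A hedged sketch of pack_directory_children with two hypothetical empty directory children of the root: the variable-length path bytes are written first, then the fixed-size child records, and children_offset records where the latter start:

    root = Node(b"", None)
    stack = [root, Node(b"a", None, root), Node(b"b", None, root)]
    data = bytearray()
    pack_directory_children(root, {}, data, stack)
    assert root.children_offset == 2           # two one-byte paths come first
    assert len(data) == 2 + 2 * NODE_SIZE      # then the two packed records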