dirstate-v2: Reserve a few bytes of space for future extensions...
Simon Sapin
r48484:852262e2 default
@@ -1,75 +1,75 @@
1 1 # dirstatedocket.py - docket file for dirstate-v2
2 2 #
3 3 # Copyright Mercurial Contributors
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import struct
11 11
12 12 from ..revlogutils import docket as docket_mod
13 13
14 14
15 15 V2_FORMAT_MARKER = b"dirstate-v2\n"
16 16
17 17 # Must match the constant of the same name in
18 18 # `rust/hg-core/src/dirstate_tree/on_disk.rs`
19 TREE_METADATA_SIZE = 40
19 TREE_METADATA_SIZE = 44
20 20
21 21 # * 12 bytes: format marker
22 22 # * 32 bytes: node ID of the working directory's first parent
23 23 # * 32 bytes: node ID of the working directory's second parent
24 24 # * 4 bytes: big-endian used size of the data file
25 25 # * {TREE_METADATA_SIZE} bytes: tree metadata, parsed separately
26 26 # * 1 byte: length of the data file's UUID
27 27 # * variable: data file's UUID
28 28 #
29 29 # Node IDs are null-padded if shorter than 32 bytes.
30 30 # A data file shorter than the specified used size is corrupted (truncated)
31 31 HEADER = struct.Struct(
32 32 ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
33 33 )
34 34
35 35
36 36 class DirstateDocket(object):
37 37 data_filename_pattern = b'dirstate.%s.d'
38 38
39 39 def __init__(self, parents, data_size, tree_metadata, uuid):
40 40 self.parents = parents
41 41 self.data_size = data_size
42 42 self.tree_metadata = tree_metadata
43 43 self.uuid = uuid
44 44
45 45 @classmethod
46 46 def with_new_uuid(cls, parents, data_size, tree_metadata):
47 47 return cls(parents, data_size, tree_metadata, docket_mod.make_uid())
48 48
49 49 @classmethod
50 50 def parse(cls, data, nodeconstants):
51 51 if not data:
52 52 parents = (nodeconstants.nullid, nodeconstants.nullid)
53 53 return cls(parents, 0, b'', None)
54 54 marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(data)
55 55 if marker != V2_FORMAT_MARKER:
56 56 raise ValueError("expected dirstate-v2 marker")
57 57 uuid = data[HEADER.size : HEADER.size + uuid_size]
58 58 p1 = p1[: nodeconstants.nodelen]
59 59 p2 = p2[: nodeconstants.nodelen]
60 60 return cls((p1, p2), data_size, meta, uuid)
61 61
62 62 def serialize(self):
63 63 p1, p2 = self.parents
64 64 header = HEADER.pack(
65 65 V2_FORMAT_MARKER,
66 66 p1,
67 67 p2,
68 68 self.data_size,
69 69 self.tree_metadata,
70 70 len(self.uuid),
71 71 )
72 72 return header + self.uuid
73 73
74 74 def data_filename(self):
75 75 return self.data_filename_pattern % self.uuid
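
For reference, here is a minimal standalone sketch of how this header round-trips through the `struct` format defined above. The null parents, zeroed tree metadata, and random UUID are made-up example values, not what Mercurial would necessarily write:

    import struct
    import uuid as uuid_mod

    V2_FORMAT_MARKER = b"dirstate-v2\n"
    TREE_METADATA_SIZE = 44  # must match the Rust constant

    HEADER = struct.Struct(
        ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
    )

    # Pack a docket with placeholder field values.
    null_node = b"\x00" * 32
    tree_metadata = b"\x00" * TREE_METADATA_SIZE
    data_uuid = uuid_mod.uuid4().hex.encode("ascii")
    header = HEADER.pack(
        V2_FORMAT_MARKER, null_node, null_node, 0, tree_metadata, len(data_uuid)
    )
    docket_bytes = header + data_uuid

    # Unpacking recovers the same fields; the variable-size UUID sits
    # immediately after the fixed-size header.
    marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(docket_bytes)
    assert marker == V2_FORMAT_MARKER
    assert docket_bytes[HEADER.size : HEADER.size + uuid_size] == data_uuid
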
@@ -1,753 +1,760 @@
1 1 //! The "version 2" disk representation of the dirstate
2 2 //!
3 3 //! # File format
4 4 //!
5 5 //! In dirstate-v2 format, the `.hg/dirstate` file is a "docket" that starts
6 6 //! with a fixed-size header whose layout is defined by the `DocketHeader`
7 7 //! struct, followed by the data file identifier.
8 8 //!
9 9 //! A separate `.hg/dirstate.{uuid}.d` file contains most of the data. That
10 10 //! file may be longer than the size given in the docket, but not shorter. Only
11 11 //! the start of the data file up to the given size is considered. The
12 12 //! fixed-size tree metadata, whose layout is defined by the `TreeMetadata`
13 13 //! struct, is stored in the docket and points into that slice of data.
14 14 //!
15 15 //! Its `root_nodes` field contains the slice (offset and length) to
16 16 //! the nodes representing the files and directories at the root of the
17 17 //! repository. Each node is also fixed-size, defined by the `Node` struct.
18 18 //! Nodes in turn contain slices to variable-size paths, and to their own child
19 19 //! nodes (if any) for nested files and directories.
20 20
21 21 use crate::dirstate_tree::dirstate_map::{self, DirstateMap, NodeRef};
22 22 use crate::dirstate_tree::path_with_basename::WithBasename;
23 23 use crate::errors::HgError;
24 24 use crate::utils::hg_path::HgPath;
25 25 use crate::DirstateEntry;
26 26 use crate::DirstateError;
27 27 use crate::DirstateParents;
28 28 use crate::EntryState;
29 29 use bytes_cast::unaligned::{I32Be, I64Be, U16Be, U32Be};
30 30 use bytes_cast::BytesCast;
31 31 use format_bytes::format_bytes;
32 32 use std::borrow::Cow;
33 33 use std::convert::{TryFrom, TryInto};
34 34 use std::time::{Duration, SystemTime, UNIX_EPOCH};
35 35
36 36 /// Added at the start of `.hg/dirstate` when the "v2" format is used.
37 37 /// This is more of a redundant sanity check than an actual "magic number",
38 38 /// since `.hg/requires` already governs which format should be used.
39 39 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
40 40
41 41 /// Keep space for 256-bit hashes
42 42 const STORED_NODE_ID_BYTES: usize = 32;
43 43
44 44 /// … even though only 160 bits are used for now, with SHA-1
45 45 const USED_NODE_ID_BYTES: usize = 20;
46 46
47 47 pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
48 48 pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
49 49
50 50 /// Must match the constant of the same name in
51 51 /// `mercurial/dirstateutils/docket.py`
52 const TREE_METADATA_SIZE: usize = 40;
52 const TREE_METADATA_SIZE: usize = 44;
53 53
54 54 /// Make sure that size-affecting changes are made knowingly
55 55 #[allow(unused)]
56 56 fn static_assert_size_of() {
57 let _ = std::mem::transmute::<DocketHeader, [u8; 121]>;
58 57 let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
58 let _ = std::mem::transmute::<DocketHeader, [u8; TREE_METADATA_SIZE + 81]>;
59 59 let _ = std::mem::transmute::<Node, [u8; 43]>;
60 60 }
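
Concretely, the sizes pinned by these assertions decompose as follows (a quick check; the byte counts come from the layout comment in the docket file above and the `TreeMetadata` fields below):

    # TreeMetadata: root_nodes (offset + len), three u32 counters,
    # the 4 reserved bytes added by this change, and the 20-byte
    # ignore patterns hash.
    tree_metadata_size = (4 + 4) + 3 * 4 + 4 + 20
    assert tree_metadata_size == 44

    # DocketHeader: marker + two parents + data_size + metadata + uuid_size.
    docket_header_size = 12 + 32 + 32 + 4 + tree_metadata_size + 1
    assert docket_header_size == tree_metadata_size + 81 == 125
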
61 61
62 62 // Must match `HEADER` in `mercurial/dirstateutils/docket.py`
63 63 #[derive(BytesCast)]
64 64 #[repr(C)]
65 65 struct DocketHeader {
66 66 marker: [u8; V2_FORMAT_MARKER.len()],
67 67 parent_1: [u8; STORED_NODE_ID_BYTES],
68 68 parent_2: [u8; STORED_NODE_ID_BYTES],
69 69
70 70 /// Counted in bytes
71 71 data_size: Size,
72 72
73 73 metadata: TreeMetadata,
74 74
75 75 uuid_size: u8,
76 76 }
77 77
78 78 pub struct Docket<'on_disk> {
79 79 header: &'on_disk DocketHeader,
80 80 uuid: &'on_disk [u8],
81 81 }
82 82
83 83 #[derive(BytesCast)]
84 84 #[repr(C)]
85 85 struct TreeMetadata {
86 86 root_nodes: ChildNodes,
87 87 nodes_with_entry_count: Size,
88 88 nodes_with_copy_source_count: Size,
89 89
90 90 /// How many bytes of this data file are not used anymore
91 91 unreachable_bytes: Size,
92 92
93 /// The current version always sets these bytes to zero when creating or
94 /// updating a dirstate. Future versions could assign some bits to signal,
95 /// for example, "the version that last wrote/updated this dirstate did so
96 /// in such-and-such a way that can be relied on by versions that know to."
97 unused: [u8; 4],
98
93 99 /// If non-zero, a hash of ignore files that were used for some previous
94 100 /// run of the `status` algorithm.
95 101 ///
96 102 /// We define:
97 103 ///
98 104 /// * "Root" ignore files are `.hgignore` at the root of the repository if
99 105 /// it exists, and files from `ui.ignore.*` config. This set of files is
100 106 /// then sorted by the string representation of their path.
101 107 /// * The "expanded contents" of an ignore files is the byte string made
102 108 /// by concatenating its contents with the "expanded contents" of other
103 109 /// files included with `include:` or `subinclude:` files, in inclusion
104 110 /// order. This definition is recursive, as included files can
105 111 /// themselves include more files.
106 112 ///
107 113 /// This hash is defined as the SHA-1 of the concatenation (in sorted
108 114 /// order) of the "expanded contents" of each "root" ignore file.
109 115 /// (Note that computing this does not require actually concatenating byte
110 116 /// strings into contiguous memory, instead SHA-1 hashing can be done
111 117 /// incrementally.)
112 118 ignore_patterns_hash: IgnorePatternsHash,
113 119 }
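
The incremental hashing mentioned above could look like this in Python. This is a sketch; `root_ignore_paths` and `expanded_contents` are hypothetical helpers standing in for Mercurial's actual ignore-file handling:

    import hashlib

    def ignore_patterns_hash(root_ignore_paths, expanded_contents):
        # Hash the "expanded contents" of each root ignore file, in
        # sorted path order, without concatenating them in memory.
        sha1 = hashlib.sha1()
        for path in sorted(root_ignore_paths):
            for chunk in expanded_contents(path):
                sha1.update(chunk)
        return sha1.digest()  # 20 bytes: IGNORE_PATTERNS_HASH_LEN
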
114 120
115 121 #[derive(BytesCast)]
116 122 #[repr(C)]
117 123 pub(super) struct Node {
118 124 full_path: PathSlice,
119 125
120 126 /// In bytes from `self.full_path.start`
121 127 base_name_start: PathSize,
122 128
123 129 copy_source: OptPathSlice,
124 130 children: ChildNodes,
125 131 pub(super) descendants_with_entry_count: Size,
126 132 pub(super) tracked_descendants_count: Size,
127 133
128 134 /// Depending on the value of `state`:
129 135 ///
130 136 /// * A null byte: `data` is not used.
131 137 ///
132 138 /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
133 139 /// represent a dirstate entry like in the v1 format.
134 140 ///
135 141 /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
136 142 /// as the `Timestamp` for the mtime of a cached directory.
137 143 ///
138 144 /// The presence of this state means that at some point, this path in
139 145 /// the working directory was observed:
140 146 ///
141 147 /// - To be a directory
142 148 /// - With the modification time as given by `Timestamp`
143 149 /// - That timestamp was already strictly in the past when observed,
144 150 /// meaning that later changes cannot happen in the same clock tick
145 151 /// and must cause a different modification time (unless the system
146 152 /// clock jumps back and we get unlucky, which is not impossible but
147 153 /// deemed unlikely enough).
148 154 /// - All direct children of this directory (as returned by
149 155 /// `std::fs::read_dir`) either have a corresponding dirstate node, or
150 156 /// are ignored by ignore patterns whose hash is in
151 157 /// `TreeMetadata::ignore_patterns_hash`.
152 158 ///
153 159 /// This means that if `std::fs::symlink_metadata` later reports the
154 160 /// same modification time and ignored patterns haven’t changed, a run
155 161 /// of status that is not listing ignored files can skip calling
156 162 /// `std::fs::read_dir` again for this directory and iterate child
157 163 /// dirstate nodes instead.
158 164 state: u8,
159 165 data: Entry,
160 166 }
161 167
162 168 #[derive(BytesCast, Copy, Clone)]
163 169 #[repr(C)]
164 170 struct Entry {
165 171 mode: I32Be,
166 172 mtime: I32Be,
167 173 size: I32Be,
168 174 }
169 175
170 176 /// Duration since the Unix epoch
171 177 #[derive(BytesCast, Copy, Clone, PartialEq)]
172 178 #[repr(C)]
173 179 pub(super) struct Timestamp {
174 180 seconds: I64Be,
175 181
176 182 /// In `0 .. 1_000_000_000`.
177 183 ///
178 184 /// This timestamp is later or earlier than `(seconds, 0)` by this many
179 185 /// nanoseconds, if `seconds` is non-negative or negative, respectively.
180 186 nanoseconds: U32Be,
181 187 }
182 188
183 189 /// Counted in bytes from the start of the file
184 190 ///
185 191 /// NOTE: not supporting `.hg/dirstate` files larger than 4 GiB.
186 192 type Offset = U32Be;
187 193
188 194 /// Counted in number of items
189 195 ///
190 196 /// NOTE: we choose not to support counting more than 4 billion nodes anywhere.
191 197 type Size = U32Be;
192 198
193 199 /// Counted in bytes
194 200 ///
195 201 /// NOTE: we choose not to support file names/paths longer than 64 KiB.
196 202 type PathSize = U16Be;
197 203
198 204 /// A contiguous sequence of `len` times `Node`, representing the child nodes
199 205 /// of either some other node or of the repository root.
200 206 ///
201 207 /// Always sorted by ascending `full_path`, to allow binary search.
202 208 /// Since nodes with the same parent nodes also have the same parent path,
203 209 /// only the `base_name`s need to be compared during binary search.
204 210 #[derive(BytesCast, Copy, Clone)]
205 211 #[repr(C)]
206 212 struct ChildNodes {
207 213 start: Offset,
208 214 len: Size,
209 215 }
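
Since siblings share their parent path, the binary search described above only needs to compare base names. A sketch in Python (3.10+ for bisect's key= parameter; `base_name` is a stand-in for however a node's name is extracted):

    import bisect

    def find_child(children, name, base_name):
        # `children` is sorted by ascending full path; siblings share
        # their parent path, so base-name order is the same order.
        i = bisect.bisect_left(children, name, key=base_name)
        if i < len(children) and base_name(children[i]) == name:
            return children[i]
        return None
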
210 216
211 217 /// A `HgPath` of `len` bytes
212 218 #[derive(BytesCast, Copy, Clone)]
213 219 #[repr(C)]
214 220 struct PathSlice {
215 221 start: Offset,
216 222 len: PathSize,
217 223 }
218 224
219 225 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
220 226 type OptPathSlice = PathSlice;
221 227
222 228 /// Unexpected file format found in `.hg/dirstate` with the "v2" format.
223 229 ///
224 230 /// This should only happen if Mercurial is buggy or a repository is corrupted.
225 231 #[derive(Debug)]
226 232 pub struct DirstateV2ParseError;
227 233
228 234 impl From<DirstateV2ParseError> for HgError {
229 235 fn from(_: DirstateV2ParseError) -> Self {
230 236 HgError::corrupted("dirstate-v2 parse error")
231 237 }
232 238 }
233 239
234 240 impl From<DirstateV2ParseError> for crate::DirstateError {
235 241 fn from(error: DirstateV2ParseError) -> Self {
236 242 HgError::from(error).into()
237 243 }
238 244 }
239 245
240 246 impl<'on_disk> Docket<'on_disk> {
241 247 pub fn parents(&self) -> DirstateParents {
242 248 use crate::Node;
243 249 let p1 = Node::try_from(&self.header.parent_1[..USED_NODE_ID_BYTES])
244 250 .unwrap()
245 251 .clone();
246 252 let p2 = Node::try_from(&self.header.parent_2[..USED_NODE_ID_BYTES])
247 253 .unwrap()
248 254 .clone();
249 255 DirstateParents { p1, p2 }
250 256 }
251 257
252 258 pub fn tree_metadata(&self) -> &[u8] {
253 259 self.header.metadata.as_bytes()
254 260 }
255 261
256 262 pub fn data_size(&self) -> usize {
257 263 // This `unwrap` could only panic on a 16-bit CPU
258 264 self.header.data_size.get().try_into().unwrap()
259 265 }
260 266
261 267 pub fn data_filename(&self) -> String {
262 268 String::from_utf8(format_bytes!(b"dirstate.{}.d", self.uuid)).unwrap()
263 269 }
264 270 }
265 271
266 272 pub fn read_docket(
267 273 on_disk: &[u8],
268 274 ) -> Result<Docket<'_>, DirstateV2ParseError> {
269 275 let (header, uuid) =
270 276 DocketHeader::from_bytes(on_disk).map_err(|_| DirstateV2ParseError)?;
271 277 let uuid_size = header.uuid_size as usize;
272 278 if header.marker == *V2_FORMAT_MARKER && uuid.len() == uuid_size {
273 279 Ok(Docket { header, uuid })
274 280 } else {
275 281 Err(DirstateV2ParseError)
276 282 }
277 283 }
278 284
279 285 pub(super) fn read<'on_disk>(
280 286 on_disk: &'on_disk [u8],
281 287 metadata: &[u8],
282 288 ) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
283 289 if on_disk.is_empty() {
284 290 return Ok(DirstateMap::empty(on_disk));
285 291 }
286 292 let (meta, _) = TreeMetadata::from_bytes(metadata)
287 293 .map_err(|_| DirstateV2ParseError)?;
288 294 let dirstate_map = DirstateMap {
289 295 on_disk,
290 296 root: dirstate_map::ChildNodes::OnDisk(read_nodes(
291 297 on_disk,
292 298 meta.root_nodes,
293 299 )?),
294 300 nodes_with_entry_count: meta.nodes_with_entry_count.get(),
295 301 nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(),
296 302 ignore_patterns_hash: meta.ignore_patterns_hash,
297 303 unreachable_bytes: meta.unreachable_bytes.get(),
298 304 };
299 305 Ok(dirstate_map)
300 306 }
301 307
302 308 impl Node {
303 309 pub(super) fn full_path<'on_disk>(
304 310 &self,
305 311 on_disk: &'on_disk [u8],
306 312 ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
307 313 read_hg_path(on_disk, self.full_path)
308 314 }
309 315
310 316 pub(super) fn base_name_start<'on_disk>(
311 317 &self,
312 318 ) -> Result<usize, DirstateV2ParseError> {
313 319 let start = self.base_name_start.get();
314 320 if start < self.full_path.len.get() {
315 321 let start = usize::try_from(start)
316 322 // u16 -> usize, which never fails (usize is at least 16 bits)
317 323 .expect("dirstate-v2 base_name_start out of bounds");
318 324 Ok(start)
319 325 } else {
320 326 Err(DirstateV2ParseError)
321 327 }
322 328 }
323 329
324 330 pub(super) fn base_name<'on_disk>(
325 331 &self,
326 332 on_disk: &'on_disk [u8],
327 333 ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
328 334 let full_path = self.full_path(on_disk)?;
329 335 let base_name_start = self.base_name_start()?;
330 336 Ok(HgPath::new(&full_path.as_bytes()[base_name_start..]))
331 337 }
332 338
333 339 pub(super) fn path<'on_disk>(
334 340 &self,
335 341 on_disk: &'on_disk [u8],
336 342 ) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
337 343 Ok(WithBasename::from_raw_parts(
338 344 Cow::Borrowed(self.full_path(on_disk)?),
339 345 self.base_name_start()?,
340 346 ))
341 347 }
342 348
343 349 pub(super) fn has_copy_source<'on_disk>(&self) -> bool {
344 350 self.copy_source.start.get() != 0
345 351 }
346 352
347 353 pub(super) fn copy_source<'on_disk>(
348 354 &self,
349 355 on_disk: &'on_disk [u8],
350 356 ) -> Result<Option<&'on_disk HgPath>, DirstateV2ParseError> {
351 357 Ok(if self.has_copy_source() {
352 358 Some(read_hg_path(on_disk, self.copy_source)?)
353 359 } else {
354 360 None
355 361 })
356 362 }
357 363
358 364 pub(super) fn node_data(
359 365 &self,
360 366 ) -> Result<dirstate_map::NodeData, DirstateV2ParseError> {
361 367 let entry = |state| {
362 368 dirstate_map::NodeData::Entry(self.entry_with_given_state(state))
363 369 };
364 370
365 371 match self.state {
366 372 b'\0' => Ok(dirstate_map::NodeData::None),
367 373 b'd' => Ok(dirstate_map::NodeData::CachedDirectory {
368 374 mtime: *self.data.as_timestamp(),
369 375 }),
370 376 b'n' => Ok(entry(EntryState::Normal)),
371 377 b'a' => Ok(entry(EntryState::Added)),
372 378 b'r' => Ok(entry(EntryState::Removed)),
373 379 b'm' => Ok(entry(EntryState::Merged)),
374 380 _ => Err(DirstateV2ParseError),
375 381 }
376 382 }
377 383
378 384 pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
379 385 if self.state == b'd' {
380 386 Some(self.data.as_timestamp())
381 387 } else {
382 388 None
383 389 }
384 390 }
385 391
386 392 pub(super) fn state(
387 393 &self,
388 394 ) -> Result<Option<EntryState>, DirstateV2ParseError> {
389 395 match self.state {
390 396 b'\0' | b'd' => Ok(None),
391 397 b'n' => Ok(Some(EntryState::Normal)),
392 398 b'a' => Ok(Some(EntryState::Added)),
393 399 b'r' => Ok(Some(EntryState::Removed)),
394 400 b'm' => Ok(Some(EntryState::Merged)),
395 401 _ => Err(DirstateV2ParseError),
396 402 }
397 403 }
398 404
399 405 fn entry_with_given_state(&self, state: EntryState) -> DirstateEntry {
400 406 DirstateEntry {
401 407 state,
402 408 mode: self.data.mode.get(),
403 409 mtime: self.data.mtime.get(),
404 410 size: self.data.size.get(),
405 411 }
406 412 }
407 413
408 414 pub(super) fn entry(
409 415 &self,
410 416 ) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
411 417 Ok(self
412 418 .state()?
413 419 .map(|state| self.entry_with_given_state(state)))
414 420 }
415 421
416 422 pub(super) fn children<'on_disk>(
417 423 &self,
418 424 on_disk: &'on_disk [u8],
419 425 ) -> Result<&'on_disk [Node], DirstateV2ParseError> {
420 426 read_nodes(on_disk, self.children)
421 427 }
422 428
423 429 pub(super) fn to_in_memory_node<'on_disk>(
424 430 &self,
425 431 on_disk: &'on_disk [u8],
426 432 ) -> Result<dirstate_map::Node<'on_disk>, DirstateV2ParseError> {
427 433 Ok(dirstate_map::Node {
428 434 children: dirstate_map::ChildNodes::OnDisk(
429 435 self.children(on_disk)?,
430 436 ),
431 437 copy_source: self.copy_source(on_disk)?.map(Cow::Borrowed),
432 438 data: self.node_data()?,
433 439 descendants_with_entry_count: self
434 440 .descendants_with_entry_count
435 441 .get(),
436 442 tracked_descendants_count: self.tracked_descendants_count.get(),
437 443 })
438 444 }
439 445 }
440 446
441 447 impl Entry {
442 448 fn from_timestamp(timestamp: Timestamp) -> Self {
443 449 // Safety: both types implement the `BytesCast` trait, so we could
444 450 // safely use `as_bytes` and `from_bytes` to do this conversion. Using
445 451 // `transmute` instead makes the compiler check that the two types
446 452 // have the same size, which eliminates the error case of
447 453 // `from_bytes`.
448 454 unsafe { std::mem::transmute::<Timestamp, Entry>(timestamp) }
449 455 }
450 456
451 457 fn as_timestamp(&self) -> &Timestamp {
452 458 // Safety: same as above in `from_timestamp`
453 459 unsafe { &*(self as *const Entry as *const Timestamp) }
454 460 }
455 461 }
456 462
457 463 impl Timestamp {
458 464 pub fn seconds(&self) -> i64 {
459 465 self.seconds.get()
460 466 }
461 467 }
462 468
463 469 impl From<SystemTime> for Timestamp {
464 470 fn from(system_time: SystemTime) -> Self {
465 471 let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
466 472 Ok(duration) => {
467 473 (duration.as_secs() as i64, duration.subsec_nanos())
468 474 }
469 475 Err(error) => {
470 476 let negative = error.duration();
471 477 (-(negative.as_secs() as i64), negative.subsec_nanos())
472 478 }
473 479 };
474 480 Timestamp {
475 481 seconds: secs.into(),
476 482 nanoseconds: nanos.into(),
477 483 }
478 484 }
479 485 }
480 486
481 487 impl From<&'_ Timestamp> for SystemTime {
482 488 fn from(timestamp: &'_ Timestamp) -> Self {
483 489 let secs = timestamp.seconds.get();
484 490 let nanos = timestamp.nanoseconds.get();
485 491 if secs >= 0 {
486 492 UNIX_EPOCH + Duration::new(secs as u64, nanos)
487 493 } else {
488 494 UNIX_EPOCH - Duration::new((-secs) as u64, nanos)
489 495 }
490 496 }
491 497 }
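
The sign convention for `nanoseconds` is easiest to see with a pre-epoch example: 2.25 seconds before the epoch is stored as seconds = -2 and nanoseconds = 250_000_000. A sketch of the decoding rule in Python:

    def to_float_seconds(seconds, nanoseconds):
        # Mirrors the conversion back to SystemTime above: nanoseconds
        # always push the value further from the epoch, in the direction
        # given by the sign of `seconds`.
        if seconds >= 0:
            return seconds + nanoseconds / 1_000_000_000
        return seconds - nanoseconds / 1_000_000_000

    assert to_float_seconds(2, 250_000_000) == 2.25    # after the epoch
    assert to_float_seconds(-2, 250_000_000) == -2.25  # before the epoch
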
492 498
493 499 fn read_hg_path(
494 500 on_disk: &[u8],
495 501 slice: PathSlice,
496 502 ) -> Result<&HgPath, DirstateV2ParseError> {
497 503 read_slice(on_disk, slice.start, slice.len.get()).map(HgPath::new)
498 504 }
499 505
500 506 fn read_nodes(
501 507 on_disk: &[u8],
502 508 slice: ChildNodes,
503 509 ) -> Result<&[Node], DirstateV2ParseError> {
504 510 read_slice(on_disk, slice.start, slice.len.get())
505 511 }
506 512
507 513 fn read_slice<T, Len>(
508 514 on_disk: &[u8],
509 515 start: Offset,
510 516 len: Len,
511 517 ) -> Result<&[T], DirstateV2ParseError>
512 518 where
513 519 T: BytesCast,
514 520 Len: TryInto<usize>,
515 521 {
516 522 // Either way, `usize::MAX` would result in an "out of bounds" error, since
517 523 // a single `&[u8]` cannot occupy the entire address space.
518 524 let start = start.get().try_into().unwrap_or(std::usize::MAX);
519 525 let len = len.try_into().unwrap_or(std::usize::MAX);
520 526 on_disk
521 527 .get(start..)
522 528 .and_then(|bytes| T::slice_from_bytes(bytes, len).ok())
523 529 .map(|(slice, _rest)| slice)
524 530 .ok_or_else(|| DirstateV2ParseError)
525 531 }
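
The same bounds discipline, expressed in Python (a sketch; in the Rust code the actual decoding is done by `bytes_cast`):

    def read_slice(on_disk, start, count, item_size):
        # Reject any slice that does not lie entirely inside `on_disk`,
        # instead of reading out of bounds on corrupt input.
        end = start + count * item_size
        if start > len(on_disk) or end > len(on_disk):
            raise ValueError("dirstate-v2 parse error")
        return on_disk[start:end]
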
526 532
527 533 pub(crate) fn for_each_tracked_path<'on_disk>(
528 534 on_disk: &'on_disk [u8],
529 535 metadata: &[u8],
530 536 mut f: impl FnMut(&'on_disk HgPath),
531 537 ) -> Result<(), DirstateV2ParseError> {
532 538 let (meta, _) = TreeMetadata::from_bytes(metadata)
533 539 .map_err(|_| DirstateV2ParseError)?;
534 540 fn recur<'on_disk>(
535 541 on_disk: &'on_disk [u8],
536 542 nodes: ChildNodes,
537 543 f: &mut impl FnMut(&'on_disk HgPath),
538 544 ) -> Result<(), DirstateV2ParseError> {
539 545 for node in read_nodes(on_disk, nodes)? {
540 546 if let Some(state) = node.state()? {
541 547 if state.is_tracked() {
542 548 f(node.full_path(on_disk)?)
543 549 }
544 550 }
545 551 recur(on_disk, node.children, f)?
546 552 }
547 553 Ok(())
548 554 }
549 555 recur(on_disk, meta.root_nodes, &mut f)
550 556 }
551 557
552 558 /// Returns new data and metadata, together with whether that data should be
553 559 /// appended to the existing data file whose content is at
554 560 /// `dirstate_map.on_disk` (true), instead of written to a new data file
555 561 /// (false).
556 562 pub(super) fn write(
557 563 dirstate_map: &mut DirstateMap,
558 564 can_append: bool,
559 565 ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
560 566 let append = can_append && dirstate_map.write_should_append();
561 567
562 568 // This ignores the space for paths, and for nodes without an entry.
563 569 // TODO: better estimate? Skip the `Vec` and write to a file directly?
564 570 let size_guess = std::mem::size_of::<Node>()
565 571 * dirstate_map.nodes_with_entry_count as usize;
566 572
567 573 let mut writer = Writer {
568 574 dirstate_map,
569 575 append,
570 576 out: Vec::with_capacity(size_guess),
571 577 };
572 578
573 579 let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
574 580
575 581 let meta = TreeMetadata {
576 582 root_nodes,
577 583 nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
578 584 nodes_with_copy_source_count: dirstate_map
579 585 .nodes_with_copy_source_count
580 586 .into(),
581 587 unreachable_bytes: dirstate_map.unreachable_bytes.into(),
588 unused: [0; 4],
582 589 ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
583 590 };
584 591 Ok((writer.out, meta.as_bytes().to_vec(), append))
585 592 }
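
A sketch of how a caller might act on the returned triple; `append_to` and `write_new` are hypothetical helpers standing in for the real transaction plumbing:

    def persist(data, tree_metadata, append, append_to, write_new):
        if append:
            # Offsets in `data` were computed relative to the end of the
            # existing data file, so it must be appended, not rewritten.
            append_to(data)
        else:
            write_new(data)
        # `tree_metadata` goes into the docket in `.hg/dirstate`.
        return tree_metadata
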
586 593
587 594 struct Writer<'dmap, 'on_disk> {
588 595 dirstate_map: &'dmap DirstateMap<'on_disk>,
589 596 append: bool,
590 597 out: Vec<u8>,
591 598 }
592 599
593 600 impl Writer<'_, '_> {
594 601 fn write_nodes(
595 602 &mut self,
596 603 nodes: dirstate_map::ChildNodesRef,
597 604 ) -> Result<ChildNodes, DirstateError> {
598 605 // Reuse already-written nodes if possible
599 606 if self.append {
600 607 if let dirstate_map::ChildNodesRef::OnDisk(nodes_slice) = nodes {
601 608 let start = self.on_disk_offset_of(nodes_slice).expect(
602 609 "dirstate-v2 OnDisk nodes not found within on_disk",
603 610 );
604 611 let len = child_nodes_len_from_usize(nodes_slice.len());
605 612 return Ok(ChildNodes { start, len });
606 613 }
607 614 }
608 615
609 616 // `dirstate_map::ChildNodes::InMemory` contains a `HashMap` which has
610 617 // undefined iteration order. Sort to enable binary search in the
611 618 // written file.
612 619 let nodes = nodes.sorted();
613 620 let nodes_len = nodes.len();
614 621
615 622 // First accumulate serialized nodes in a `Vec`
616 623 let mut on_disk_nodes = Vec::with_capacity(nodes_len);
617 624 for node in nodes {
618 625 let children =
619 626 self.write_nodes(node.children(self.dirstate_map.on_disk)?)?;
620 627 let full_path = node.full_path(self.dirstate_map.on_disk)?;
621 628 let full_path = self.write_path(full_path.as_bytes());
622 629 let copy_source = if let Some(source) =
623 630 node.copy_source(self.dirstate_map.on_disk)?
624 631 {
625 632 self.write_path(source.as_bytes())
626 633 } else {
627 634 PathSlice {
628 635 start: 0.into(),
629 636 len: 0.into(),
630 637 }
631 638 };
632 639 on_disk_nodes.push(match node {
633 640 NodeRef::InMemory(path, node) => {
634 641 let (state, data) = match &node.data {
635 642 dirstate_map::NodeData::Entry(entry) => (
636 643 entry.state.into(),
637 644 Entry {
638 645 mode: entry.mode.into(),
639 646 mtime: entry.mtime.into(),
640 647 size: entry.size.into(),
641 648 },
642 649 ),
643 650 dirstate_map::NodeData::CachedDirectory { mtime } => {
644 651 (b'd', Entry::from_timestamp(*mtime))
645 652 }
646 653 dirstate_map::NodeData::None => (
647 654 b'\0',
648 655 Entry {
649 656 mode: 0.into(),
650 657 mtime: 0.into(),
651 658 size: 0.into(),
652 659 },
653 660 ),
654 661 };
655 662 Node {
656 663 children,
657 664 copy_source,
658 665 full_path,
659 666 base_name_start: u16::try_from(path.base_name_start())
660 667 // Could only panic for paths over 64 KiB
661 668 .expect("dirstate-v2 path length overflow")
662 669 .into(),
663 670 descendants_with_entry_count: node
664 671 .descendants_with_entry_count
665 672 .into(),
666 673 tracked_descendants_count: node
667 674 .tracked_descendants_count
668 675 .into(),
669 676 state,
670 677 data,
671 678 }
672 679 }
673 680 NodeRef::OnDisk(node) => Node {
674 681 children,
675 682 copy_source,
676 683 full_path,
677 684 ..*node
678 685 },
679 686 })
680 687 }
681 688 // … so we can write them contiguously, after writing everything else
682 689 // they refer to.
683 690 let start = self.current_offset();
684 691 let len = child_nodes_len_from_usize(nodes_len);
685 692 self.out.extend(on_disk_nodes.as_bytes());
686 693 Ok(ChildNodes { start, len })
687 694 }
688 695
689 696 /// If the given slice of items is within `on_disk`, returns its offset
690 697 /// from the start of `on_disk`.
691 698 fn on_disk_offset_of<T>(&self, slice: &[T]) -> Option<Offset>
692 699 where
693 700 T: BytesCast,
694 701 {
695 702 fn address_range(slice: &[u8]) -> std::ops::RangeInclusive<usize> {
696 703 let start = slice.as_ptr() as usize;
697 704 let end = start + slice.len();
698 705 start..=end
699 706 }
700 707 let slice_addresses = address_range(slice.as_bytes());
701 708 let on_disk_addresses = address_range(self.dirstate_map.on_disk);
702 709 if on_disk_addresses.contains(slice_addresses.start())
703 710 && on_disk_addresses.contains(slice_addresses.end())
704 711 {
705 712 let offset = slice_addresses.start() - on_disk_addresses.start();
706 713 Some(offset_from_usize(offset))
707 714 } else {
708 715 None
709 716 }
710 717 }
711 718
712 719 fn current_offset(&mut self) -> Offset {
713 720 let mut offset = self.out.len();
714 721 if self.append {
715 722 offset += self.dirstate_map.on_disk.len()
716 723 }
717 724 offset_from_usize(offset)
718 725 }
719 726
720 727 fn write_path(&mut self, slice: &[u8]) -> PathSlice {
721 728 let len = path_len_from_usize(slice.len());
722 729 // Reuse an already-written path if possible
723 730 if self.append {
724 731 if let Some(start) = self.on_disk_offset_of(slice) {
725 732 return PathSlice { start, len };
726 733 }
727 734 }
728 735 let start = self.current_offset();
729 736 self.out.extend(slice.as_bytes());
730 737 PathSlice { start, len }
731 738 }
732 739 }
733 740
734 741 fn offset_from_usize(x: usize) -> Offset {
735 742 u32::try_from(x)
736 743 // Could only panic for a dirstate file larger than 4 GiB
737 744 .expect("dirstate-v2 offset overflow")
738 745 .into()
739 746 }
740 747
741 748 fn child_nodes_len_from_usize(x: usize) -> Size {
742 749 u32::try_from(x)
743 750 // Could only panic with over 4 billion nodes
744 751 .expect("dirstate-v2 slice length overflow")
745 752 .into()
746 753 }
747 754
748 755 fn path_len_from_usize(x: usize) -> PathSize {
749 756 u16::try_from(x)
750 757 // Could only panic for paths over 64 KiB
751 758 .expect("dirstate-v2 path length overflow")
752 759 .into()
753 760 }