# HG changeset patch # User Simon Sapin # Date 2021-07-15 21:02:17 # Node ID 78f7f0d490ee2e064c6dbab5a9e2aec4b448e9c7 # Parent d94118365ec58d02aa6c3a6f905cd1789dca1a98 dirstate-v2: Move fixed-size tree metadata into the docket file Before this changeset, the dirstate-v2 data file contained not only nodes and paths that may be reused when appending to an existing file, but also some fixed-size metadata that applies to the entire tree and was added at the end of the data file for every append. This moves that metadata into the docket file, so that repeated "append" operations without meaningful changes don’t actually need to grow any file. Differential Revision: https://phab.mercurial-scm.org/D11098 diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py --- a/mercurial/debugcommands.py +++ b/mercurial/debugcommands.py @@ -999,11 +999,7 @@ def debugdirstateignorepatternshash(ui, if repo.dirstate._use_dirstate_v2: docket = repo.dirstate._map.docket hash_len = 20 # 160 bits for SHA-1 - hash_offset = docket.data_size - hash_len # hash is at the end - data_filename = docket.data_filename() - with repo.vfs(data_filename) as f: - f.seek(hash_offset) - hash_bytes = f.read(hash_len) + hash_bytes = docket.tree_metadata[-hash_len:] ui.write(binascii.hexlify(hash_bytes) + b'\n') diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py --- a/mercurial/dirstatemap.py +++ b/mercurial/dirstatemap.py @@ -638,7 +638,7 @@ if rustmod is not None: else: data = b'' self._rustmap = rustmod.DirstateMap.new_v2( - data, self.docket.data_size + data, self.docket.data_size, self.docket.tree_metadata ) parents = self.docket.parents else: @@ -665,7 +665,7 @@ if rustmod is not None: # We can only append to an existing data file if there is one can_append = self.docket.uuid is not None - packed, append = self._rustmap.write_v2(now, can_append) + packed, meta, append = self._rustmap.write_v2(now, can_append) if append: docket = self.docket data_filename = docket.data_filename() 
@@ -679,12 +679,13 @@ if rustmod is not None: assert written == len(packed), (written, len(packed)) docket.data_size += len(packed) docket.parents = self.parents() + docket.tree_metadata = meta st.write(docket.serialize()) st.close() else: old_docket = self.docket new_docket = docketmod.DirstateDocket.with_new_uuid( - self.parents(), len(packed) + self.parents(), len(packed), meta ) data_filename = new_docket.data_filename() if tr: diff --git a/mercurial/dirstateutils/docket.py b/mercurial/dirstateutils/docket.py --- a/mercurial/dirstateutils/docket.py +++ b/mercurial/dirstateutils/docket.py @@ -14,47 +14,60 @@ from ..revlogutils import docket as dock V2_FORMAT_MARKER = b"dirstate-v2\n" +# Must match the constant of the same name in +# `rust/hg-core/src/dirstate_tree/on_disk.rs` +TREE_METADATA_SIZE = 40 + # * 12 bytes: format marker # * 32 bytes: node ID of the working directory's first parent # * 32 bytes: node ID of the working directory's second parent # * 4 bytes: big-endian used size of the data file +# * {TREE_METADATA_SIZE} bytes: tree metadata, parsed separately # * 1 byte: length of the data file's UUID # * variable: data file's UUID # # Node IDs are null-padded if shorter than 32 bytes. 
# A data file shorter than the specified used size is corrupted (truncated) -HEADER = struct.Struct(">{}s32s32sLB".format(len(V2_FORMAT_MARKER))) +HEADER = struct.Struct( + ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE) +) class DirstateDocket(object): data_filename_pattern = b'dirstate.%s.d' - def __init__(self, parents, data_size, uuid): + def __init__(self, parents, data_size, tree_metadata, uuid): self.parents = parents self.data_size = data_size + self.tree_metadata = tree_metadata self.uuid = uuid @classmethod - def with_new_uuid(cls, parents, data): - return cls(parents, data, docket_mod.make_uid()) + def with_new_uuid(cls, parents, data_size, tree_metadata): + return cls(parents, data_size, tree_metadata, docket_mod.make_uid()) @classmethod def parse(cls, data, nodeconstants): if not data: parents = (nodeconstants.nullid, nodeconstants.nullid) - return cls(parents, 0, None) - marker, p1, p2, data_size, uuid_size = HEADER.unpack_from(data) + return cls(parents, 0, b'', None) + marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(data) if marker != V2_FORMAT_MARKER: raise ValueError("expected dirstate-v2 marker") uuid = data[HEADER.size : HEADER.size + uuid_size] p1 = p1[: nodeconstants.nodelen] p2 = p2[: nodeconstants.nodelen] - return cls((p1, p2), data_size, uuid) + return cls((p1, p2), data_size, meta, uuid) def serialize(self): p1, p2 = self.parents header = HEADER.pack( - V2_FORMAT_MARKER, p1, p2, self.data_size, len(self.uuid) + V2_FORMAT_MARKER, + p1, + p2, + self.data_size, + self.tree_metadata, + len(self.uuid), ) return header + self.uuid diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs --- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs @@ -424,9 +424,10 @@ impl<'on_disk> DirstateMap<'on_disk> { pub fn new_v2( on_disk: &'on_disk [u8], data_size: usize, + metadata: &[u8], ) -> Result { if let Some(data) = 
on_disk.get(..data_size) { - Ok(on_disk::read(data)?) + Ok(on_disk::read(data, metadata)?) } else { Err(DirstateV2ParseError.into()) } @@ -1094,15 +1095,16 @@ impl<'on_disk> super::dispatch::Dirstate Ok(packed) } - /// Returns new data together with whether that data should be appended to - /// the existing data file whose content is at `self.on_disk` (true), - /// instead of written to a new data file (false). + /// Returns new data and metadata together with whether that data should be + /// appended to the existing data file whose content is at + /// `self.on_disk` (true), instead of written to a new data file + /// (false). #[timed] fn pack_v2( &mut self, now: Timestamp, can_append: bool, - ) -> Result<(Vec<u8>, bool), DirstateError> { + ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> { // TODO: how do we want to handle this in 2038? let now: i32 = now.0.try_into().expect("time overflow"); let mut paths = Vec::new(); diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs --- a/rust/hg-core/src/dirstate_tree/dispatch.rs +++ b/rust/hg-core/src/dirstate_tree/dispatch.rs @@ -182,16 +182,17 @@ pub trait DirstateMapMethods { /// serialize bytes to write a dirstate data file to disk in dirstate-v2 /// format. /// - /// Returns new data together with whether that data should be appended to - /// the existing data file whose content is at `self.on_disk` (true), - /// instead of written to a new data file (false). + /// Returns new data and metadata together with whether that data should be + /// appended to the existing data file whose content is at + /// `self.on_disk` (true), instead of written to a new data file + /// (false). /// /// Note: this is only supported by the tree dirstate map. fn pack_v2( &mut self, now: Timestamp, can_append: bool, - ) -> Result<(Vec<u8>, bool), DirstateError>; + ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError>; /// Run the status algorithm. 
/// @@ -395,7 +396,7 @@ impl DirstateMapMethods for DirstateMap &mut self, _now: Timestamp, _can_append: bool, - ) -> Result<(Vec, bool), DirstateError> { + ) -> Result<(Vec, Vec, bool), DirstateError> { panic!( "should have used dirstate_tree::DirstateMap to use the v2 format" ) diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs --- a/rust/hg-core/src/dirstate_tree/on_disk.rs +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs @@ -47,6 +47,18 @@ const USED_NODE_ID_BYTES: usize = 20; pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20; pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN]; +/// Must match the constant of the same name in +/// `mercurial/dirstateutils/docket.py` +const TREE_METADATA_SIZE: usize = 40; + +/// Make sure that size-affecting changes are made knowingly +#[allow(unused)] +fn static_assert_size_of() { + let _ = std::mem::transmute::; + let _ = std::mem::transmute::; + let _ = std::mem::transmute::; +} + // Must match `HEADER` in `mercurial/dirstateutils/docket.py` #[derive(BytesCast)] #[repr(C)] @@ -58,6 +70,8 @@ struct DocketHeader { /// Counted in bytes data_size: Size, + metadata: TreeMetadata, + uuid_size: u8, } @@ -68,7 +82,7 @@ pub struct Docket<'on_disk> { #[derive(BytesCast)] #[repr(C)] -struct Root { +struct TreeMetadata { root_nodes: ChildNodes, nodes_with_entry_count: Size, nodes_with_copy_source_count: Size, @@ -134,7 +148,7 @@ pub(super) struct Node { /// - All direct children of this directory (as returned by /// `std::fs::read_dir`) either have a corresponding dirstate node, or /// are ignored by ignore patterns whose hash is in - /// `Root::ignore_patterns_hash`. + /// `TreeMetadata::ignore_patterns_hash`. 
/// /// This means that if `std::fs::symlink_metadata` later reports the /// same modification time and ignored patterns haven’t changed, a run @@ -205,13 +219,6 @@ struct PathSlice { /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes type OptPathSlice = PathSlice; -/// Make sure that size-affecting changes are made knowingly -fn _static_assert_size_of() { - let _ = std::mem::transmute::; - let _ = std::mem::transmute::; - let _ = std::mem::transmute::; -} - /// Unexpected file format found in `.hg/dirstate` with the "v2" format. /// /// This should only happen if Mercurial is buggy or a repository is corrupted. @@ -242,6 +249,10 @@ impl<'on_disk> Docket<'on_disk> { DirstateParents { p1, p2 } } + pub fn tree_metadata(&self) -> &[u8] { + self.header.metadata.as_bytes() + } + pub fn data_size(&self) -> usize { // This `unwrap` could only panic on a 16-bit CPU self.header.data_size.get().try_into().unwrap() @@ -265,40 +276,25 @@ pub fn read_docket( } } -fn read_root<'on_disk>( - on_disk: &'on_disk [u8], -) -> Result<&'on_disk Root, DirstateV2ParseError> { - // Find the `Root` at the end of the given slice - let root_offset = on_disk - .len() - .checked_sub(std::mem::size_of::()) - // A non-empty slice too short is an error - .ok_or(DirstateV2ParseError)?; - let (root, _) = Root::from_bytes(&on_disk[root_offset..]) - .map_err(|_| DirstateV2ParseError)?; - Ok(root) -} - pub(super) fn read<'on_disk>( on_disk: &'on_disk [u8], + metadata: &[u8], ) -> Result, DirstateV2ParseError> { if on_disk.is_empty() { return Ok(DirstateMap::empty(on_disk)); } - let root = read_root(on_disk)?; - let mut unreachable_bytes = root.unreachable_bytes.get(); - // Each append writes a new `Root`, so it’s never reused - unreachable_bytes += std::mem::size_of::() as u32; + let (meta, _) = TreeMetadata::from_bytes(metadata) + .map_err(|_| DirstateV2ParseError)?; let dirstate_map = DirstateMap { on_disk, root: dirstate_map::ChildNodes::OnDisk(read_nodes( on_disk, - root.root_nodes, + 
meta.root_nodes, )?), - nodes_with_entry_count: root.nodes_with_entry_count.get(), - nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(), - ignore_patterns_hash: root.ignore_patterns_hash, - unreachable_bytes, + nodes_with_entry_count: meta.nodes_with_entry_count.get(), + nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(), + ignore_patterns_hash: meta.ignore_patterns_hash, + unreachable_bytes: meta.unreachable_bytes.get(), }; Ok(dirstate_map) } @@ -530,9 +526,11 @@ where pub(crate) fn for_each_tracked_path<'on_disk>( on_disk: &'on_disk [u8], + metadata: &[u8], mut f: impl FnMut(&'on_disk HgPath), ) -> Result<(), DirstateV2ParseError> { - let root = read_root(on_disk)?; + let (meta, _) = TreeMetadata::from_bytes(metadata) + .map_err(|_| DirstateV2ParseError)?; fn recur<'on_disk>( on_disk: &'on_disk [u8], nodes: ChildNodes, @@ -548,23 +546,23 @@ pub(crate) fn for_each_tracked_path<'on_ } Ok(()) } - recur(on_disk, root.root_nodes, &mut f) + recur(on_disk, meta.root_nodes, &mut f) } -/// Returns new data together with whether that data should be appended to the -/// existing data file whose content is at `dirstate_map.on_disk` (true), -/// instead of written to a new data file (false). +/// Returns new data and metadata, together with whether that data should be +/// appended to the existing data file whose content is at +/// `dirstate_map.on_disk` (true), instead of written to a new data file +/// (false). pub(super) fn write( dirstate_map: &mut DirstateMap, can_append: bool, -) -> Result<(Vec<u8>, bool), DirstateError> { +) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> { let append = can_append && dirstate_map.write_should_append(); // This ignores the space for paths, and for nodes without an entry. // TODO: better estimate? Skip the `Vec` and write to a file directly? 
- let size_guess = std::mem::size_of::() - + std::mem::size_of::() - * dirstate_map.nodes_with_entry_count as usize; + let size_guess = std::mem::size_of::() + * dirstate_map.nodes_with_entry_count as usize; let mut writer = Writer { dirstate_map, @@ -574,7 +572,7 @@ pub(super) fn write( let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?; - let root = Root { + let meta = TreeMetadata { root_nodes, nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(), nodes_with_copy_source_count: dirstate_map @@ -583,8 +581,7 @@ pub(super) fn write( unreachable_bytes: dirstate_map.unreachable_bytes.into(), ignore_patterns_hash: dirstate_map.ignore_patterns_hash, }; - writer.out.extend(root.as_bytes()); - Ok((writer.out, append)) + Ok((writer.out, meta.as_bytes().to_vec(), append)) } struct Writer<'dmap, 'on_disk> { diff --git a/rust/hg-core/src/operations/list_tracked_files.rs b/rust/hg-core/src/operations/list_tracked_files.rs --- a/rust/hg-core/src/operations/list_tracked_files.rs +++ b/rust/hg-core/src/operations/list_tracked_files.rs @@ -22,27 +22,33 @@ use rayon::prelude::*; pub struct Dirstate { /// The `dirstate` content. content: Vec, - dirstate_v2: bool, + v2_metadata: Option>, } impl Dirstate { pub fn new(repo: &Repo) -> Result { let mut content = repo.hg_vfs().read("dirstate")?; - if repo.has_dirstate_v2() { + let v2_metadata = if repo.has_dirstate_v2() { let docket = read_docket(&content)?; + let meta = docket.tree_metadata().to_vec(); content = repo.hg_vfs().read(docket.data_filename())?; - } + Some(meta) + } else { + None + }; Ok(Self { content, - dirstate_v2: repo.has_dirstate_v2(), + v2_metadata, }) } pub fn tracked_files(&self) -> Result, DirstateError> { let mut files = Vec::new(); if !self.content.is_empty() { - if self.dirstate_v2 { - for_each_tracked_path(&self.content, |path| files.push(path))? + if let Some(meta) = &self.v2_metadata { + for_each_tracked_path(&self.content, meta, |path| { + files.push(path) + })? 
} else { let _parents = parse_dirstate_entries( &self.content, diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs --- a/rust/hg-cpython/src/dirstate/dirstate_map.rs +++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs @@ -84,12 +84,14 @@ py_class!(pub class DirstateMap |py| { def new_v2( on_disk: PyBytes, data_size: usize, + tree_metadata: PyBytes, ) -> PyResult { let dirstate_error = |e: DirstateError| { PyErr::new::(py, format!("Dirstate error: {:?}", e)) }; - let inner = OwningDirstateMap::new_v2(py, on_disk, data_size) - .map_err(dirstate_error)?; + let inner = OwningDirstateMap::new_v2( + py, on_disk, data_size, tree_metadata, + ).map_err(dirstate_error)?; let map = Self::create_instance(py, Box::new(inner))?; Ok(map.into_object()) } @@ -353,9 +355,11 @@ py_class!(pub class DirstateMap |py| { let mut inner = self.inner(py).borrow_mut(); let result = inner.pack_v2(now, can_append); match result { - Ok((packed, append)) => { + Ok((packed, tree_metadata, append)) => { let packed = PyBytes::new(py, &packed); - Ok((packed, append).to_py_object(py).into_object()) + let tree_metadata = PyBytes::new(py, &tree_metadata); + let tuple = (packed, tree_metadata, append); + Ok(tuple.to_py_object(py).into_object()) }, Err(_) => Err(PyErr::new::( py, diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs --- a/rust/hg-cpython/src/dirstate/dispatch.rs +++ b/rust/hg-cpython/src/dirstate/dispatch.rs @@ -128,7 +128,7 @@ impl DirstateMapMethods for OwningDirsta &mut self, now: Timestamp, can_append: bool, - ) -> Result<(Vec, bool), DirstateError> { + ) -> Result<(Vec, Vec, bool), DirstateError> { self.get_mut().pack_v2(now, can_append) } diff --git a/rust/hg-cpython/src/dirstate/owning.rs b/rust/hg-cpython/src/dirstate/owning.rs --- a/rust/hg-cpython/src/dirstate/owning.rs +++ b/rust/hg-cpython/src/dirstate/owning.rs @@ -49,9 +49,11 @@ impl OwningDirstateMap { py: Python, on_disk: 
PyBytes, data_size: usize, + tree_metadata: PyBytes, ) -> Result<Self, DirstateError> { let bytes: &'_ [u8] = on_disk.data(py); - let map = DirstateMap::new_v2(bytes, data_size)?; + let map = + DirstateMap::new_v2(bytes, data_size, tree_metadata.data(py))?; // Like in `bytes` above, this `'_` lifetime parameter borrows from // the bytes buffer owned by `on_disk`. diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs --- a/rust/rhg/src/commands/status.rs +++ b/rust/rhg/src/commands/status.rs @@ -168,13 +168,16 @@ pub fn run(invocation: &crate::CliInvoca let repo = invocation.repo?; let dirstate_data_mmap; let (mut dmap, parents) = if repo.has_dirstate_v2() { + let docket_data = + repo.hg_vfs().read("dirstate").io_not_found_as_none()?; let parents; let dirstate_data; let data_size; - if let Some(docket_data) = - repo.hg_vfs().read("dirstate").io_not_found_as_none()? - { - let docket = on_disk::read_docket(&docket_data)?; + let docket; + let tree_metadata; + if let Some(docket_data) = &docket_data { + docket = on_disk::read_docket(docket_data)?; + tree_metadata = docket.tree_metadata(); parents = Some(docket.parents()); data_size = docket.data_size(); dirstate_data_mmap = repo @@ -184,10 +187,12 @@ pub fn run(invocation: &crate::CliInvoca dirstate_data = dirstate_data_mmap.as_deref().unwrap_or(b""); } else { parents = None; + tree_metadata = b""; data_size = 0; dirstate_data = b""; } - let dmap = DirstateMap::new_v2(dirstate_data, data_size)?; + let dmap = + DirstateMap::new_v2(dirstate_data, data_size, tree_metadata)?; (dmap, parents) } else { dirstate_data_mmap =