# HG changeset patch # User Simon Sapin # Date 2021-07-13 15:18:23 # Node ID 065e6162898006e45cbd2c1e3e652427ca6869ef # Parent da1c0cd68d53778b34296e6acfb58a02db121705 dirstate-v2: Support appending to the same data file For now we’re still writing the entire data every time, so appending is not useful yet. Later we’ll have new nodes pointing to some existing data for nodes and paths that haven’t changed. The decision whether to append is pseudo-random in order to make tests exercise both code paths. This will be replaced by a heuristic based on the amount of unused existing data. Differential Revision: https://phab.mercurial-scm.org/D11094 diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py --- a/mercurial/dirstatemap.py +++ b/mercurial/dirstatemap.py @@ -655,13 +655,41 @@ if rustmod is not None: return self._rustmap def write(self, tr, st, now): - if self._use_dirstate_v2: - packed = self._rustmap.write_v2(now) + if not self._use_dirstate_v2: + p1, p2 = self.parents() + packed = self._rustmap.write_v1(p1, p2, now) + st.write(packed) + st.close() + self._dirtyparents = False + return + + # We can only append to an existing data file if there is one + can_append = self.docket.uuid is not None + packed, append = self._rustmap.write_v2(now, can_append) + if append: + docket = self.docket + data_filename = docket.data_filename() + if tr: + tr.add(data_filename, docket.data_size) + with self._opener(data_filename, b'r+b') as fp: + fp.seek(docket.data_size) + assert fp.tell() == docket.data_size + written = fp.write(packed) + if written is not None: # py2 may return None + assert written == len(packed), (written, len(packed)) + docket.data_size += len(packed) + docket.parents = self.parents() + st.write(docket.serialize()) + st.close() + else: old_docket = self.docket new_docket = docketmod.DirstateDocket.with_new_uuid( self.parents(), len(packed) ) - self._opener.write(new_docket.data_filename(), packed) + data_filename = new_docket.data_filename() + if tr: + tr.add(data_filename, 0) + self._opener.write(data_filename, packed) # Write the new docket after the new data file has been # written. Because `st` was opened with `atomictemp=True`, # the actual `.hg/dirstate` file is only affected on close. @@ -670,13 +698,16 @@ if rustmod is not None: # Remove the old data file after the new docket pointing to # the new data file was written. if old_docket.uuid: - self._opener.unlink(old_docket.data_filename()) + data_filename = old_docket.data_filename() + unlink = lambda _tr=None: self._opener.unlink(data_filename) + if tr: + category = b"dirstate-v2-clean-" + old_docket.uuid + tr.addpostclose(category, unlink) + else: + unlink() self._docket = new_docket - else: - p1, p2 = self.parents() - packed = self._rustmap.write_v1(p1, p2, now) - st.write(packed) - st.close() + # Reload from the newly-written file + util.clearcachedproperty(self, b"_rustmap") self._dirtyparents = False @propertycache diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs --- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs @@ -468,6 +468,24 @@ impl<'on_disk> DirstateMap<'on_disk> { Ok((map, parents)) } + /// Assuming dirstate-v2 format, returns whether the next write should + /// append to the existing data file that contains `self.on_disk` (true), + /// or create a new data file from scratch (false). + pub(super) fn write_should_append(&self) -> bool { + // Soon this will be a heuristic based on the amount of unreachable + // data. For now it’s pseudo-random in order to make tests exercise + // both code paths. + + fn bad_rng() -> u32 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .subsec_millis() + } + + bad_rng() % 2 == 0 + } + fn get_node<'tree>( &'tree self, path: &HgPath, @@ -1043,8 +1061,15 @@ impl<'on_disk> super::dispatch::Dirstate Ok(packed) } + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (true), + /// instead of written to a new data file (false). #[timed] - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError> { + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { // TODO: how do we want to handle this in 2038? let now: i32 = now.0.try_into().expect("time overflow"); let mut paths = Vec::new(); @@ -1063,7 +1088,7 @@ impl<'on_disk> super::dispatch::Dirstate self.clear_known_ambiguous_mtimes(&paths)?; - on_disk::write(self) + on_disk::write(self, can_append) } fn status<'a>( diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs --- a/rust/hg-core/src/dirstate_tree/dispatch.rs +++ b/rust/hg-core/src/dirstate_tree/dispatch.rs @@ -179,11 +179,19 @@ pub trait DirstateMapMethods { /// Clear mtimes that are ambigous with `now` (similar to /// `clear_ambiguous_times` but for all files in the dirstate map), and - /// serialize bytes to write the `.hg/dirstate` file to disk in dirstate-v2 + /// serialize bytes to write a dirstate data file to disk in dirstate-v2 /// format. /// + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (true), + /// instead of written to a new data file (false). + /// /// Note: this is only supported by the tree dirstate map. - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError>; + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError>; /// Run the status algorithm. /// @@ -383,7 +391,11 @@ impl DirstateMapMethods for DirstateMap self.pack(parents, now) } - fn pack_v2(&mut self, _now: Timestamp) -> Result, DirstateError> { + fn pack_v2( + &mut self, + _now: Timestamp, + _can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { panic!( "should have used dirstate_tree::DirstateMap to use the v2 format" ) diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs --- a/rust/hg-core/src/dirstate_tree/on_disk.rs +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs @@ -544,20 +544,28 @@ pub(crate) fn for_each_tracked_path<'on_ recur(on_disk, root.root_nodes, &mut f) } +/// Returns new data together with whether that data should be appended to the +/// existing data file whose content is at `dirstate_map.on_disk` (true), +/// instead of written to a new data file (false). pub(super) fn write( dirstate_map: &mut DirstateMap, -) -> Result, DirstateError> { - let root_len = std::mem::size_of::(); + can_append: bool, +) -> Result<(Vec, bool), DirstateError> { + let append = can_append && dirstate_map.write_should_append(); // This ignores the space for paths, and for nodes without an entry. // TODO: better estimate? Skip the `Vec` and write to a file directly? - let size_guess = root_len + let size_guess = std::mem::size_of::() + std::mem::size_of::() * dirstate_map.nodes_with_entry_count as usize; - let mut out = Vec::with_capacity(size_guess); - let root_nodes = - write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?; + let mut writer = Writer { + dirstate_map, + append, + out: Vec::with_capacity(size_guess), + }; + + let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?; let root = Root { root_nodes, @@ -567,112 +575,121 @@ pub(super) fn write( .into(), ignore_patterns_hash: dirstate_map.ignore_patterns_hash, }; - out.extend(root.as_bytes()); - Ok(out) + writer.out.extend(root.as_bytes()); + Ok((writer.out, append)) +} + +struct Writer<'dmap, 'on_disk> { + dirstate_map: &'dmap DirstateMap<'on_disk>, + append: bool, + out: Vec, } -fn write_nodes( - dirstate_map: &DirstateMap, - nodes: dirstate_map::ChildNodesRef, - out: &mut Vec, -) -> Result { - // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration - // order. Sort to enable binary search in the written file. - let nodes = nodes.sorted(); - let nodes_len = nodes.len(); +impl Writer<'_, '_> { + fn write_nodes( + &mut self, + nodes: dirstate_map::ChildNodesRef, + ) -> Result { + // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration + // order. Sort to enable binary search in the written file. + let nodes = nodes.sorted(); + let nodes_len = nodes.len(); - // First accumulate serialized nodes in a `Vec` - let mut on_disk_nodes = Vec::with_capacity(nodes_len); - for node in nodes { - let children = write_nodes( - dirstate_map, - node.children(dirstate_map.on_disk)?, - out, - )?; - let full_path = node.full_path(dirstate_map.on_disk)?; - let full_path = write_path(full_path.as_bytes(), out); - let copy_source = - if let Some(source) = node.copy_source(dirstate_map.on_disk)? { - write_path(source.as_bytes(), out) + // First accumulate serialized nodes in a `Vec` + let mut on_disk_nodes = Vec::with_capacity(nodes_len); + for node in nodes { + let children = + self.write_nodes(node.children(self.dirstate_map.on_disk)?)?; + let full_path = node.full_path(self.dirstate_map.on_disk)?; + let full_path = self.write_path(full_path.as_bytes()); + let copy_source = if let Some(source) = + node.copy_source(self.dirstate_map.on_disk)? + { + self.write_path(source.as_bytes()) } else { PathSlice { start: 0.into(), len: 0.into(), } }; - on_disk_nodes.push(match node { - NodeRef::InMemory(path, node) => { - let (state, data) = match &node.data { - dirstate_map::NodeData::Entry(entry) => ( - entry.state.into(), - Entry { - mode: entry.mode.into(), - mtime: entry.mtime.into(), - size: entry.size.into(), - }, - ), - dirstate_map::NodeData::CachedDirectory { mtime } => { - (b'd', Entry::from_timestamp(*mtime)) + on_disk_nodes.push(match node { + NodeRef::InMemory(path, node) => { + let (state, data) = match &node.data { + dirstate_map::NodeData::Entry(entry) => ( + entry.state.into(), + Entry { + mode: entry.mode.into(), + mtime: entry.mtime.into(), + size: entry.size.into(), + }, + ), + dirstate_map::NodeData::CachedDirectory { mtime } => { + (b'd', Entry::from_timestamp(*mtime)) + } + dirstate_map::NodeData::None => ( + b'\0', + Entry { + mode: 0.into(), + mtime: 0.into(), + size: 0.into(), + }, + ), + }; + Node { + children, + copy_source, + full_path, + base_name_start: u16::try_from(path.base_name_start()) + // Could only panic for paths over 64 KiB + .expect("dirstate-v2 path length overflow") + .into(), + descendants_with_entry_count: node + .descendants_with_entry_count + .into(), + tracked_descendants_count: node + .tracked_descendants_count + .into(), + state, + data, } - dirstate_map::NodeData::None => ( - b'\0', - Entry { - mode: 0.into(), - mtime: 0.into(), - size: 0.into(), - }, - ), - }; - Node { + } + NodeRef::OnDisk(node) => Node { children, copy_source, full_path, - base_name_start: u16::try_from(path.base_name_start()) - // Could only panic for paths over 64 KiB - .expect("dirstate-v2 path length overflow") - .into(), - descendants_with_entry_count: node - .descendants_with_entry_count - .into(), - tracked_descendants_count: node - .tracked_descendants_count - .into(), - state, - data, - } - } - NodeRef::OnDisk(node) => Node { - children, - copy_source, - full_path, - ..*node - }, - }) + ..*node + }, + }) + } + // … so we can write them contiguously, after writing everything else + // they refer to. + let start = self.current_offset(); + let len = u32::try_from(nodes_len) + // Could only panic with over 4 billion nodes + .expect("dirstate-v2 path length overflow") + .into(); + self.out.extend(on_disk_nodes.as_bytes()); + Ok(ChildNodes { start, len }) } - // … so we can write them contiguously, after writing everything else they - // refer to. - let start = current_offset(out); - let len = u32::try_from(nodes_len) - // Could only panic with over 4 billion nodes - .expect("dirstate-v2 path length overflow") - .into(); - out.extend(on_disk_nodes.as_bytes()); - Ok(ChildNodes { start, len }) -} -fn current_offset(out: &Vec) -> Offset { - u32::try_from(out.len()) - // Could only panic for a dirstate file larger than 4 GiB - .expect("dirstate-v2 offset overflow") - .into() -} + fn current_offset(&mut self) -> Offset { + let mut offset = self.out.len(); + if self.append { + offset += self.dirstate_map.on_disk.len() + } + u32::try_from(offset) + // Could only panic for a dirstate file larger than 4 GiB + .expect("dirstate-v2 offset overflow") + .into() + } -fn write_path(slice: &[u8], out: &mut Vec) -> PathSlice { - let start = current_offset(out); - let len = u16::try_from(slice.len()) - // Could only panic for paths over 64 KiB - .expect("dirstate-v2 path length overflow") - .into(); - out.extend(slice.as_bytes()); - PathSlice { start, len } + fn write_path(&mut self, slice: &[u8]) -> PathSlice { + let start = self.current_offset(); + let len = u16::try_from(slice.len()) + // Could only panic for paths over 64 KiB + .expect("dirstate-v2 path length overflow") + .into(); + self.out.extend(slice.as_bytes()); + PathSlice { start, len } + } } diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs --- a/rust/hg-cpython/src/dirstate/dirstate_map.rs +++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs @@ -340,16 +340,23 @@ py_class!(pub class DirstateMap |py| { } } + /// Returns new data together with whether that data should be appended to + /// the existing data file whose content is at `self.on_disk` (True), + /// instead of written to a new data file (False). def write_v2( &self, - now: PyObject - ) -> PyResult { + now: PyObject, + can_append: bool, + ) -> PyResult { let now = Timestamp(now.extract(py)?); let mut inner = self.inner(py).borrow_mut(); - let result = inner.pack_v2(now); + let result = inner.pack_v2(now, can_append); match result { - Ok(packed) => Ok(PyBytes::new(py, &packed)), + Ok((packed, append)) => { + let packed = PyBytes::new(py, &packed); + Ok((packed, append).to_py_object(py).into_object()) + }, Err(_) => Err(PyErr::new::( py, "Dirstate error".to_string(), diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs --- a/rust/hg-cpython/src/dirstate/dispatch.rs +++ b/rust/hg-cpython/src/dirstate/dispatch.rs @@ -124,8 +124,12 @@ impl DirstateMapMethods for OwningDirsta self.get_mut().pack_v1(parents, now) } - fn pack_v2(&mut self, now: Timestamp) -> Result, DirstateError> { - self.get_mut().pack_v2(now) + fn pack_v2( + &mut self, + now: Timestamp, + can_append: bool, + ) -> Result<(Vec, bool), DirstateError> { + self.get_mut().pack_v2(now, can_append) } fn status<'a>(