//! The "version 2" disk representation of the dirstate
//!
//! # File format
//!
//! The file starts with a fixed-sized header, whose layout is defined by the
//! `Header` struct. Its `root` field contains the slice (offset and length) to
//! the nodes representing the files and directories at the root of the
//! repository. Each node is also fixed-size, defined by the `Node` struct.
//! Nodes in turn contain slices to variable-size paths, and to their own child
//! nodes (if any) for nested files and directories.
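//!
//! Offsets in the file are always counted in bytes from the start of the
//! file, so only the header lives at a fixed position: every other piece of
//! data (nodes, paths, copy sources) is reached by following the `Slice`
//! values (offset and length) starting from `Header::root`.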
use crate::dirstate_tree::dirstate_map::{self, DirstateMap, NodeRef};
use crate::dirstate_tree::path_with_basename::WithBasename;
use crate::errors::HgError;
use crate::utils::hg_path::HgPath;
use crate::DirstateEntry;
use crate::DirstateError;
use crate::DirstateParents;
use crate::EntryState;
use bytes_cast::unaligned::{I32Be, I64Be, U32Be, U64Be};
use bytes_cast::BytesCast;
use std::borrow::Cow;
use std::convert::TryFrom;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
/// Added at the start of `.hg/dirstate` when the "v2" format is used.
/// This is a redundant sanity check more than an actual "magic number" since
/// `.hg/requires` already governs which format should be used.
pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
#[derive(BytesCast)]
#[repr(C)]
struct Header {
marker: [u8; V2_FORMAT_MARKER.len()],
/// `dirstatemap.parents()` in `mercurial/dirstate.py` relies on this
/// `parents` field being at this offset, immediately after `marker`.
parents: DirstateParents,
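/// Slice of the `Node`s for the top-level files and directories of the
/// repository (see `ChildNodes` below).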
root: ChildNodes,
nodes_with_entry_count: Size,
nodes_with_copy_source_count: Size,
}
#[derive(BytesCast)]
#[repr(C)]
pub(super) struct Node {
full_path: PathSlice,
/// In bytes from `self.full_path.start`
base_name_start: Size,
copy_source: OptPathSlice,
children: ChildNodes,
pub(super) tracked_descendants_count: Size,
/// Depending on the value of `state`:
///
/// * A null byte: `data` is not used.
///
/// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
/// represent a dirstate entry like in the v1 format.
///
/// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
/// as the `Timestamp` for the mtime of a cached directory.
///
/// The presence of this state means that at some point, this path in
/// the working directory was observed:
///
/// - To be a directory
/// - With the modification time as given by `Timestamp`
/// - That timestamp was already strictly in the past when observed,
/// meaning that later changes cannot happen in the same clock tick
/// and must cause a different modification time (unless the system
/// clock jumps back and we get unlucky, which is not impossible but
/// deemed unlikely enough).
/// - The directory did not contain any child entry that did not have a
/// corresponding dirstate node.
///
/// This means that if `std::fs::symlink_metadata` later reports the
/// same modification time, we don’t need to call `std::fs::read_dir`
/// again for this directory and can iterate child dirstate nodes
/// instead.
state: u8,
data: Entry,
}
#[derive(BytesCast, Copy, Clone)]
#[repr(C)]
struct Entry {
mode: I32Be,
mtime: I32Be,
size: I32Be,
}
/// Duration since the Unix epoch
#[derive(BytesCast, Copy, Clone, PartialEq)]
#[repr(C)]
pub(super) struct Timestamp {
seconds: I64Be,
/// In `0 .. 1_000_000_000`.
///
/// This timestamp is later or earlier than `(seconds, 0)` by this many
/// nanoseconds, if `seconds` is non-negative or negative, respectively.
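///
/// For example, a point in time 2.25 seconds before the Unix epoch is
/// encoded as `seconds = -2` and `nanoseconds = 250_000_000`.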
nanoseconds: U32Be,
}
/// Counted in bytes from the start of the file
///
/// NOTE: If we decide to never support `.hg/dirstate` files larger than 4 GiB
/// we could save space by using `U32Be` instead.
type Offset = U64Be;
/// Counted in number of items
///
/// NOTE: not supporting directories with more than 4 billion direct children,
/// or filenames longer than 4 GiB.
type Size = U32Be;
/// Location of consecutive, fixed-size items.
///
/// An item can be a single byte for paths, or a struct with
/// `derive(BytesCast)`.
#[derive(BytesCast, Copy, Clone)]
#[repr(C)]
struct Slice {
start: Offset,
len: Size,
}
/// A contiguous sequence of `len` times `Node`, representing the child nodes
/// of either some other node or of the repository root.
///
/// Always sorted by ascending `full_path`, to allow binary search.
/// Since nodes with the same parent also have the same parent path,
/// only the `base_name`s need to be compared during binary search.
type ChildNodes = Slice;
/// A `HgPath` of `len` bytes
type PathSlice = Slice;
/// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
type OptPathSlice = Slice;
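// For example, a `PathSlice { start: 120, len: 5 }` refers to the five bytes
// `on_disk[120..125]`, which `read_hg_path` below interprets as an `HgPath`.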
/// Make sure that size-affecting changes are made knowingly
fn _static_assert_size_of() {
let _ = std::mem::transmute::<Header, [u8; 72]>;
let _ = std::mem::transmute::<Node, [u8; 57]>;
}
/// Unexpected file format found in `.hg/dirstate` with the "v2" format.
///
/// This should only happen if Mercurial is buggy or a repository is corrupted.
#[derive(Debug)]
pub struct DirstateV2ParseError;
impl From<DirstateV2ParseError> for HgError {
fn from(_: DirstateV2ParseError) -> Self {
HgError::corrupted("dirstate-v2 parse error")
}
}
impl From<DirstateV2ParseError> for crate::DirstateError {
fn from(error: DirstateV2ParseError) -> Self {
HgError::from(error).into()
}
}
fn read_header(on_disk: &[u8]) -> Result<&Header, DirstateV2ParseError> {
let (header, _) =
Header::from_bytes(on_disk).map_err(|_| DirstateV2ParseError)?;
if header.marker == *V2_FORMAT_MARKER {
Ok(header)
} else {
Err(DirstateV2ParseError)
}
}
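/// Parse an entire "v2" dirstate file into a `DirstateMap` that borrows its
/// paths from `on_disk`, together with the parents recorded in the header
/// (`None` if the file is empty).
///
/// A minimal usage sketch, assuming `on_disk` holds the contents of
/// `.hg/dirstate` in the v2 format:
///
/// ```ignore
/// let (map, parents) = read(&on_disk)?;
/// ```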
pub(super) fn read<'on_disk>(
on_disk: &'on_disk [u8],
) -> Result<
(DirstateMap<'on_disk>, Option<DirstateParents>),
DirstateV2ParseError,
> {
if on_disk.is_empty() {
return Ok((DirstateMap::empty(on_disk), None));
}
let header = read_header(on_disk)?;
let dirstate_map = DirstateMap {
on_disk,
root: dirstate_map::ChildNodes::OnDisk(read_slice::<Node>(
on_disk,
header.root,
)?),
nodes_with_entry_count: header.nodes_with_entry_count.get(),
nodes_with_copy_source_count: header
.nodes_with_copy_source_count
.get(),
};
let parents = Some(header.parents.clone());
Ok((dirstate_map, parents))
}
impl Node {
pub(super) fn full_path<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
read_hg_path(on_disk, self.full_path)
}
pub(super) fn base_name_start<'on_disk>(
&self,
) -> Result<usize, DirstateV2ParseError> {
let start = self.base_name_start.get();
if start < self.full_path.len.get() {
let start = usize::try_from(start)
// u32 -> usize, could only panic on a 16-bit CPU
.expect("dirstate-v2 base_name_start out of bounds");
Ok(start)
} else {
Err(DirstateV2ParseError)
}
}
pub(super) fn base_name<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
let full_path = self.full_path(on_disk)?;
let base_name_start = self.base_name_start()?;
Ok(HgPath::new(&full_path.as_bytes()[base_name_start..]))
}
pub(super) fn path<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
Ok(WithBasename::from_raw_parts(
Cow::Borrowed(self.full_path(on_disk)?),
self.base_name_start()?,
))
}
pub(super) fn has_copy_source<'on_disk>(&self) -> bool {
self.copy_source.start.get() != 0
}
pub(super) fn copy_source<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<Option<&'on_disk HgPath>, DirstateV2ParseError> {
Ok(if self.has_copy_source() {
Some(read_hg_path(on_disk, self.copy_source)?)
} else {
None
})
}
pub(super) fn node_data(
&self,
) -> Result<dirstate_map::NodeData, DirstateV2ParseError> {
let entry = |state| {
dirstate_map::NodeData::Entry(self.entry_with_given_state(state))
};
match self.state {
b'\0' => Ok(dirstate_map::NodeData::None),
b'd' => Ok(dirstate_map::NodeData::CachedDirectory {
mtime: *self.data.as_timestamp(),
}),
b'n' => Ok(entry(EntryState::Normal)),
b'a' => Ok(entry(EntryState::Added)),
b'r' => Ok(entry(EntryState::Removed)),
b'm' => Ok(entry(EntryState::Merged)),
_ => Err(DirstateV2ParseError),
}
}
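/// The cached modification time if this node is a `d` (cached directory)
/// entry as described on the `state` field, or `None` otherwise.
///
/// A hedged sketch of the intended use (hypothetical caller, not code from
/// this module):
///
/// ```ignore
/// let metadata = std::fs::symlink_metadata(&fs_path)?;
/// let observed = Timestamp::from(metadata.modified()?);
/// if node.cached_directory_mtime() == Some(&observed) {
///     // The directory is unchanged since the dirstate was written:
///     // iterate `node.children(on_disk)?` instead of calling `read_dir`.
/// }
/// ```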
pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
if self.state == b'd' {
Some(self.data.as_timestamp())
} else {
None
}
}
pub(super) fn state(
&self,
) -> Result<Option<EntryState>, DirstateV2ParseError> {
match self.state {
b'\0' | b'd' => Ok(None),
b'n' => Ok(Some(EntryState::Normal)),
b'a' => Ok(Some(EntryState::Added)),
b'r' => Ok(Some(EntryState::Removed)),
b'm' => Ok(Some(EntryState::Merged)),
_ => Err(DirstateV2ParseError),
}
}
fn entry_with_given_state(&self, state: EntryState) -> DirstateEntry {
DirstateEntry {
state,
mode: self.data.mode.get(),
mtime: self.data.mtime.get(),
size: self.data.size.get(),
}
}
pub(super) fn entry(
&self,
) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
Ok(self
.state()?
.map(|state| self.entry_with_given_state(state)))
}
pub(super) fn children<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<&'on_disk [Node], DirstateV2ParseError> {
read_slice::<Node>(on_disk, self.children)
}
pub(super) fn to_in_memory_node<'on_disk>(
&self,
on_disk: &'on_disk [u8],
) -> Result<dirstate_map::Node<'on_disk>, DirstateV2ParseError> {
Ok(dirstate_map::Node {
children: dirstate_map::ChildNodes::OnDisk(
self.children(on_disk)?,
),
copy_source: self.copy_source(on_disk)?.map(Cow::Borrowed),
data: self.node_data()?,
tracked_descendants_count: self.tracked_descendants_count.get(),
})
}
}
impl Entry {
fn from_timestamp(timestamp: Timestamp) -> Self {
// Safety: both types implement the `BytesCast` trait, so we could
// safely use `as_bytes` and `from_bytes` to do this conversion. Using
// `transmute` instead makes the compiler check that the two types
// have the same size, which eliminates the error case of
// `from_bytes`.
unsafe { std::mem::transmute::<Timestamp, Entry>(timestamp) }
}
fn as_timestamp(&self) -> &Timestamp {
// Safety: same as above in `from_timestamp`
unsafe { &*(self as *const Entry as *const Timestamp) }
}
}
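// Note: `Entry` (three 4-byte big-endian integers) and `Timestamp` (one
// 8-byte and one 4-byte big-endian integer) are both 12 bytes with no
// padding, which is what makes the transmute in `from_timestamp` compile.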
impl Timestamp {
pub fn seconds(&self) -> i64 {
self.seconds.get()
}
}
impl From<SystemTime> for Timestamp {
fn from(system_time: SystemTime) -> Self {
let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
Ok(duration) => {
(duration.as_secs() as i64, duration.subsec_nanos())
}
Err(error) => {
let negative = error.duration();
(-(negative.as_secs() as i64), negative.subsec_nanos())
}
};
Timestamp {
seconds: secs.into(),
nanoseconds: nanos.into(),
}
}
}
impl From<&'_ Timestamp> for SystemTime {
fn from(timestamp: &'_ Timestamp) -> Self {
let secs = timestamp.seconds.get();
let nanos = timestamp.nanoseconds.get();
if secs >= 0 {
UNIX_EPOCH + Duration::new(secs as u64, nanos)
} else {
UNIX_EPOCH - Duration::new((-secs) as u64, nanos)
}
}
}
fn read_hg_path(
on_disk: &[u8],
slice: Slice,
) -> Result<&HgPath, DirstateV2ParseError> {
let bytes = read_slice::<u8>(on_disk, slice)?;
Ok(HgPath::new(bytes))
}
fn read_slice<T>(
on_disk: &[u8],
slice: Slice,
) -> Result<&[T], DirstateV2ParseError>
where
T: BytesCast,
{
// If either conversion overflows (only possible on a 32-bit platform), fall
// back to `usize::MAX`, which is guaranteed to fail the bounds checks below
// since a single `&[u8]` cannot occupy the entire address space.
let start = usize::try_from(slice.start.get()).unwrap_or(std::usize::MAX);
let len = usize::try_from(slice.len.get()).unwrap_or(std::usize::MAX);
on_disk
.get(start..)
.and_then(|bytes| T::slice_from_bytes(bytes, len).ok())
.map(|(slice, _rest)| slice)
.ok_or_else(|| DirstateV2ParseError)
}
pub(crate) fn parse_dirstate_parents(
on_disk: &[u8],
) -> Result<&DirstateParents, HgError> {
Ok(&read_header(on_disk)?.parents)
}
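/// Call `f` with the full path of every file that has a tracked state,
/// walking the on-disk tree of nodes depth-first.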
pub(crate) fn for_each_tracked_path<'on_disk>(
on_disk: &'on_disk [u8],
mut f: impl FnMut(&'on_disk HgPath),
) -> Result<(), DirstateV2ParseError> {
let header = read_header(on_disk)?;
fn recur<'on_disk>(
on_disk: &'on_disk [u8],
nodes: Slice,
f: &mut impl FnMut(&'on_disk HgPath),
) -> Result<(), DirstateV2ParseError> {
for node in read_slice::<Node>(on_disk, nodes)? {
if let Some(state) = node.state()? {
if state.is_tracked() {
f(node.full_path(on_disk)?)
}
}
recur(on_disk, node.children, f)?
}
Ok(())
}
recur(on_disk, header.root, &mut f)
}
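/// Serialize `dirstate_map` into a new `Vec<u8>` containing the complete
/// "v2" on-disk representation, with `parents` stored in the header.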
pub(super) fn write(
dirstate_map: &mut DirstateMap,
parents: DirstateParents,
) -> Result<Vec<u8>, DirstateError> {
let header_len = std::mem::size_of::<Header>();
// This ignores the space for paths, and for nodes without an entry.
// TODO: better estimate? Skip the `Vec` and write to a file directly?
let size_guess = header_len
+ std::mem::size_of::<Node>()
* dirstate_map.nodes_with_entry_count as usize;
let mut out = Vec::with_capacity(size_guess);
// Keep space for the header. We’ll fill it out at the end when we know the
// actual offset for the root nodes.
out.resize(header_len, 0_u8);
let root =
write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?;
let header = Header {
marker: *V2_FORMAT_MARKER,
parents,
root,
nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
nodes_with_copy_source_count: dirstate_map
.nodes_with_copy_source_count
.into(),
};
out[..header_len].copy_from_slice(header.as_bytes());
Ok(out)
}
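/// Serialize `nodes` (and, recursively, their descendants) into `out`,
/// returning the slice describing where this level of nodes was written.
///
/// Children are written before their parents, so the root nodes referenced
/// by the header end up near the end of the file.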
fn write_nodes(
dirstate_map: &DirstateMap,
nodes: dirstate_map::ChildNodesRef,
out: &mut Vec<u8>,
) -> Result<ChildNodes, DirstateError> {
// `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
// order. Sort to enable binary search in the written file.
let nodes = nodes.sorted();
// First accumulate serialized nodes in a `Vec`
let mut on_disk_nodes = Vec::with_capacity(nodes.len());
for node in nodes {
let children = write_nodes(
dirstate_map,
node.children(dirstate_map.on_disk)?,
out,
)?;
let full_path = node.full_path(dirstate_map.on_disk)?;
let full_path = write_slice::<u8>(full_path.as_bytes(), out);
let copy_source =
if let Some(source) = node.copy_source(dirstate_map.on_disk)? {
write_slice::<u8>(source.as_bytes(), out)
} else {
Slice {
start: 0.into(),
len: 0.into(),
}
};
on_disk_nodes.push(match node {
NodeRef::InMemory(path, node) => {
let (state, data) = match &node.data {
dirstate_map::NodeData::Entry(entry) => (
entry.state.into(),
Entry {
mode: entry.mode.into(),
mtime: entry.mtime.into(),
size: entry.size.into(),
},
),
dirstate_map::NodeData::CachedDirectory { mtime } => {
(b'd', Entry::from_timestamp(*mtime))
}
dirstate_map::NodeData::None => (
b'\0',
Entry {
mode: 0.into(),
mtime: 0.into(),
size: 0.into(),
},
),
};
Node {
children,
copy_source,
full_path,
base_name_start: u32::try_from(path.base_name_start())
// Could only panic for paths over 4 GiB
.expect("dirstate-v2 offset overflow")
.into(),
tracked_descendants_count: node
.tracked_descendants_count
.into(),
state,
data,
}
}
NodeRef::OnDisk(node) => Node {
children,
copy_source,
full_path,
..*node
},
})
}
// … so we can write them contiguously
Ok(write_slice::<Node>(&on_disk_nodes, out))
}
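/// Append the bytes of `slice` to `out` and return the `Slice` (offset and
/// length) describing where they were written.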
fn write_slice<T>(slice: &[T], out: &mut Vec<u8>) -> Slice
where
T: BytesCast,
{
let start = u64::try_from(out.len())
// Could only panic on a 128-bit CPU with a dirstate over 16 EiB
.expect("dirstate-v2 offset overflow")
.into();
let len = u32::try_from(slice.len())
// Could only panic for paths over 4 GiB or nodes with over 4 billion
// child nodes
.expect("dirstate-v2 offset overflow")
.into();
out.extend(slice.as_bytes());
Slice { start, len }
}