##// END OF EJS Templates
dirstate-v2: Change the on-disk format to be tree-shaped...
Simon Sapin -
r48058:2a9ddc80 default
parent child Browse files
Show More
@@ -4,17 +4,15 b' use std::borrow::Cow;'
4 use std::convert::TryInto;
4 use std::convert::TryInto;
5 use std::path::PathBuf;
5 use std::path::PathBuf;
6
6
7 use super::on_disk::V2_FORMAT_MARKER;
7 use super::on_disk;
8 use super::path_with_basename::WithBasename;
8 use super::path_with_basename::WithBasename;
9 use crate::dirstate::parsers::clear_ambiguous_mtime;
9 use crate::dirstate::parsers::clear_ambiguous_mtime;
10 use crate::dirstate::parsers::pack_entry;
10 use crate::dirstate::parsers::pack_entry;
11 use crate::dirstate::parsers::packed_entry_size;
11 use crate::dirstate::parsers::packed_entry_size;
12 use crate::dirstate::parsers::parse_dirstate_entries;
12 use crate::dirstate::parsers::parse_dirstate_entries;
13 use crate::dirstate::parsers::Timestamp;
13 use crate::dirstate::parsers::Timestamp;
14 use crate::errors::HgError;
15 use crate::matchers::Matcher;
14 use crate::matchers::Matcher;
16 use crate::utils::hg_path::{HgPath, HgPathBuf};
15 use crate::utils::hg_path::{HgPath, HgPathBuf};
17 use crate::utils::SliceExt;
18 use crate::CopyMapIter;
16 use crate::CopyMapIter;
19 use crate::DirstateEntry;
17 use crate::DirstateEntry;
20 use crate::DirstateError;
18 use crate::DirstateError;
@@ -30,16 +28,16 b' use crate::StatusOptions;'
30
28
31 pub struct DirstateMap<'on_disk> {
29 pub struct DirstateMap<'on_disk> {
32 /// Contents of the `.hg/dirstate` file
30 /// Contents of the `.hg/dirstate` file
33 on_disk: &'on_disk [u8],
31 pub(super) on_disk: &'on_disk [u8],
34
32
35 pub(super) root: ChildNodes<'on_disk>,
33 pub(super) root: ChildNodes<'on_disk>,
36
34
37 /// Number of nodes anywhere in the tree that have `.entry.is_some()`.
35 /// Number of nodes anywhere in the tree that have `.entry.is_some()`.
38 nodes_with_entry_count: usize,
36 pub(super) nodes_with_entry_count: u32,
39
37
40 /// Number of nodes anywhere in the tree that have
38 /// Number of nodes anywhere in the tree that have
41 /// `.copy_source.is_some()`.
39 /// `.copy_source.is_some()`.
42 nodes_with_copy_source_count: usize,
40 pub(super) nodes_with_copy_source_count: u32,
43 }
41 }
44
42
45 /// Using a plain `HgPathBuf` of the full path from the repository root as a
43 /// Using a plain `HgPathBuf` of the full path from the repository root as a
@@ -62,7 +60,7 b" pub(super) struct Node<'on_disk> {"
62 pub(super) children: ChildNodes<'on_disk>,
60 pub(super) children: ChildNodes<'on_disk>,
63
61
64 /// How many (non-inclusive) descendants of this node are tracked files
62 /// How many (non-inclusive) descendants of this node are tracked files
65 tracked_descendants_count: usize,
63 pub(super) tracked_descendants_count: u32,
66 }
64 }
67
65
68 impl<'on_disk> Node<'on_disk> {
66 impl<'on_disk> Node<'on_disk> {
@@ -89,32 +87,27 b" type NodeDataMut<'tree, 'on_disk> = ("
89 );
87 );
90
88
91 impl<'on_disk> DirstateMap<'on_disk> {
89 impl<'on_disk> DirstateMap<'on_disk> {
90 pub(super) fn empty(on_disk: &'on_disk [u8]) -> Self {
91 Self {
92 on_disk,
93 root: ChildNodes::default(),
94 nodes_with_entry_count: 0,
95 nodes_with_copy_source_count: 0,
96 }
97 }
98
92 #[timed]
99 #[timed]
93 pub fn new_v2(
100 pub fn new_v2(
94 on_disk: &'on_disk [u8],
101 on_disk: &'on_disk [u8],
95 ) -> Result<(Self, Option<DirstateParents>), DirstateError> {
102 ) -> Result<(Self, Option<DirstateParents>), DirstateError> {
96 if let Some(rest) = on_disk.drop_prefix(V2_FORMAT_MARKER) {
103 on_disk::read(on_disk)
97 Self::new_v1(rest)
98 } else if on_disk.is_empty() {
99 Self::new_v1(on_disk)
100 } else {
101 return Err(HgError::corrupted(
102 "missing dirstate-v2 magic number",
103 )
104 .into());
105 }
106 }
104 }
107
105
108 #[timed]
106 #[timed]
109 pub fn new_v1(
107 pub fn new_v1(
110 on_disk: &'on_disk [u8],
108 on_disk: &'on_disk [u8],
111 ) -> Result<(Self, Option<DirstateParents>), DirstateError> {
109 ) -> Result<(Self, Option<DirstateParents>), DirstateError> {
112 let mut map = Self {
110 let mut map = Self::empty(on_disk);
113 on_disk,
114 root: ChildNodes::default(),
115 nodes_with_entry_count: 0,
116 nodes_with_copy_source_count: 0,
117 };
118 if map.on_disk.is_empty() {
111 if map.on_disk.is_empty() {
119 return Ok((map, None));
112 return Ok((map, None));
120 }
113 }
@@ -565,10 +558,7 b" impl<'on_disk> super::dispatch::Dirstate"
565 parents: DirstateParents,
558 parents: DirstateParents,
566 now: Timestamp,
559 now: Timestamp,
567 ) -> Result<Vec<u8>, DirstateError> {
560 ) -> Result<Vec<u8>, DirstateError> {
568 // Inefficient but temporary
561 on_disk::write(self, parents, now)
569 let mut v2 = V2_FORMAT_MARKER.to_vec();
570 v2.append(&mut self.pack_v1(parents, now)?);
571 Ok(v2)
572 }
562 }
573
563
574 fn set_all_dirs(&mut self) -> Result<(), DirstateMapError> {
564 fn set_all_dirs(&mut self) -> Result<(), DirstateMapError> {
@@ -595,7 +585,7 b" impl<'on_disk> super::dispatch::Dirstate"
595 }
585 }
596
586
597 fn copy_map_len(&self) -> usize {
587 fn copy_map_len(&self) -> usize {
598 self.nodes_with_copy_source_count
588 self.nodes_with_copy_source_count as usize
599 }
589 }
600
590
601 fn copy_map_iter(&self) -> CopyMapIter<'_> {
591 fn copy_map_iter(&self) -> CopyMapIter<'_> {
@@ -646,7 +636,7 b" impl<'on_disk> super::dispatch::Dirstate"
646 }
636 }
647
637
648 fn len(&self) -> usize {
638 fn len(&self) -> usize {
649 self.nodes_with_entry_count
639 self.nodes_with_entry_count as usize
650 }
640 }
651
641
652 fn contains_key(&self, key: &HgPath) -> bool {
642 fn contains_key(&self, key: &HgPath) -> bool {
@@ -1,4 +1,335 b''
1 //! The "version 2" disk representation of the dirstate
2 //!
3 //! # File format
4 //!
5 //! The file starts with a fixed-sized header, whose layout is defined by the
6 //! `Header` struct. Its `root` field contains the slice (offset and length) to
7 //! the nodes representing the files and directories at the root of the
8 //! repository. Each node is also fixed-size, defined by the `Node` struct.
9 //! Nodes in turn contain slices to variable-size paths, and to their own child
10 //! nodes (if any) for nested files and directories.
11
12 use crate::dirstate::parsers::clear_ambiguous_mtime;
13 use crate::dirstate::parsers::Timestamp;
14 use crate::dirstate_tree::dirstate_map::{self, DirstateMap};
15 use crate::dirstate_tree::path_with_basename::WithBasename;
16 use crate::errors::HgError;
17 use crate::utils::hg_path::HgPath;
18 use crate::DirstateEntry;
19 use crate::DirstateError;
20 use crate::DirstateParents;
21 use bytes_cast::unaligned::{I32Be, U32Be, U64Be};
22 use bytes_cast::BytesCast;
23 use std::borrow::Cow;
24 use std::convert::{TryFrom, TryInto};
25
1 /// Added at the start of `.hg/dirstate` when the "v2" format is used.
26 /// Added at the start of `.hg/dirstate` when the "v2" format is used.
2 /// Acts like a "magic number". This is a sanity check, not strictly necessary
27 /// This a redundant sanity check more than an actual "magic number" since
3 /// since `.hg/requires` already governs which format should be used.
28 /// `.hg/requires` already governs which format should be used.
4 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
29 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
30
31 #[derive(BytesCast)]
32 #[repr(C)]
33 struct Header {
34 marker: [u8; V2_FORMAT_MARKER.len()],
35
36 /// `dirstatemap.parents()` in `mercurial/dirstate.py` relies on this
37 /// `parents` field being at this offset, immediately after `marker`.
38 parents: DirstateParents,
39
40 root: ChildNodes,
41 nodes_with_entry_count: Size,
42 nodes_with_copy_source_count: Size,
43 }
44
45 #[derive(BytesCast)]
46 #[repr(C)]
47 struct Node {
48 full_path: PathSlice,
49
50 /// In bytes from `self.full_path.start`
51 base_name_start: Size,
52
53 copy_source: OptPathSlice,
54 entry: OptEntry,
55 children: ChildNodes,
56 tracked_descendants_count: Size,
57 }
58
59 /// Either nothing if `state == b'\0'`, or a dirstate entry like in the v1
60 /// format
61 #[derive(BytesCast)]
62 #[repr(C)]
63 struct OptEntry {
64 state: u8,
65 mode: I32Be,
66 mtime: I32Be,
67 size: I32Be,
68 }
69
70 /// Counted in bytes from the start of the file
71 ///
72 /// NOTE: If we decide to never support `.hg/dirstate` files larger than 4 GiB
73 /// we could save space by using `U32Be` instead.
74 type Offset = U64Be;
75
76 /// Counted in number of items
77 ///
78 /// NOTE: not supporting directories with more than 4 billion direct children,
79 /// or filenames more than 4 GiB.
80 type Size = U32Be;
81
82 /// Location of consecutive, fixed-size items.
83 ///
84 /// An item can be a single byte for paths, or a struct with
85 /// `derive(BytesCast)`.
86 #[derive(BytesCast, Copy, Clone)]
87 #[repr(C)]
88 struct Slice {
89 start: Offset,
90 len: Size,
91 }
92
93 /// A contiguous sequence of `len` times `Node`, representing the child nodes
94 /// of either some other node or of the repository root.
95 ///
96 /// Always sorted by ascending `full_path`, to allow binary search.
97 /// Since nodes with the same parent nodes also have the same parent path,
98 /// only the `base_name`s need to be compared during binary search.
99 type ChildNodes = Slice;
100
101 /// A `HgPath` of `len` bytes
102 type PathSlice = Slice;
103
104 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
105 type OptPathSlice = Slice;
106
107 /// Make sure that size-affecting changes are made knowingly
108 fn _static_assert_size_of() {
109 let _ = std::mem::transmute::<Header, [u8; 72]>;
110 let _ = std::mem::transmute::<Node, [u8; 57]>;
111 }
112
113 pub(super) fn read<'on_disk>(
114 on_disk: &'on_disk [u8],
115 ) -> Result<(DirstateMap<'on_disk>, Option<DirstateParents>), DirstateError> {
116 if on_disk.is_empty() {
117 return Ok((DirstateMap::empty(on_disk), None));
118 }
119 let (header, _) = Header::from_bytes(on_disk)
120 .map_err(|_| HgError::corrupted("truncated dirstate-v2"))?;
121 let Header {
122 marker,
123 parents,
124 root,
125 nodes_with_entry_count,
126 nodes_with_copy_source_count,
127 } = header;
128 if marker != V2_FORMAT_MARKER {
129 return Err(HgError::corrupted("missing dirstated-v2 marker").into());
130 }
131 let dirstate_map = DirstateMap {
132 on_disk,
133 root: read_nodes(on_disk, *root)?,
134 nodes_with_entry_count: nodes_with_entry_count.get(),
135 nodes_with_copy_source_count: nodes_with_copy_source_count.get(),
136 };
137 let parents = Some(parents.clone());
138 Ok((dirstate_map, parents))
139 }
140
141 impl Node {
142 pub(super) fn path<'on_disk>(
143 &self,
144 on_disk: &'on_disk [u8],
145 ) -> Result<dirstate_map::NodeKey<'on_disk>, HgError> {
146 let full_path = read_hg_path(on_disk, self.full_path)?;
147 let base_name_start = usize::try_from(self.base_name_start.get())
148 // u32 -> usize, could only panic on a 16-bit CPU
149 .expect("dirstate-v2 base_name_start out of bounds");
150 if base_name_start < full_path.len() {
151 Ok(WithBasename::from_raw_parts(full_path, base_name_start))
152 } else {
153 Err(HgError::corrupted(
154 "dirstate-v2 base_name_start out of bounds",
155 ))
156 }
157 }
158
159 pub(super) fn copy_source<'on_disk>(
160 &self,
161 on_disk: &'on_disk [u8],
162 ) -> Result<Option<Cow<'on_disk, HgPath>>, HgError> {
163 Ok(if self.copy_source.start.get() != 0 {
164 Some(read_hg_path(on_disk, self.copy_source)?)
165 } else {
166 None
167 })
168 }
169
170 pub(super) fn entry(&self) -> Result<Option<DirstateEntry>, HgError> {
171 Ok(if self.entry.state != b'\0' {
172 Some(DirstateEntry {
173 state: self.entry.state.try_into()?,
174 mode: self.entry.mode.get(),
175 mtime: self.entry.mtime.get(),
176 size: self.entry.size.get(),
177 })
178 } else {
179 None
180 })
181 }
182
183 pub(super) fn to_in_memory_node<'on_disk>(
184 &self,
185 on_disk: &'on_disk [u8],
186 ) -> Result<dirstate_map::Node<'on_disk>, HgError> {
187 Ok(dirstate_map::Node {
188 children: read_nodes(on_disk, self.children)?,
189 copy_source: self.copy_source(on_disk)?,
190 entry: self.entry()?,
191 tracked_descendants_count: self.tracked_descendants_count.get(),
192 })
193 }
194 }
195
196 fn read_nodes(
197 on_disk: &[u8],
198 slice: ChildNodes,
199 ) -> Result<dirstate_map::ChildNodes, HgError> {
200 read_slice::<Node>(on_disk, slice)?
201 .iter()
202 .map(|node| {
203 Ok((node.path(on_disk)?, node.to_in_memory_node(on_disk)?))
204 })
205 .collect()
206 }
207
208 fn read_hg_path(on_disk: &[u8], slice: Slice) -> Result<Cow<HgPath>, HgError> {
209 let bytes = read_slice::<u8>(on_disk, slice)?;
210 Ok(Cow::Borrowed(HgPath::new(bytes)))
211 }
212
213 fn read_slice<T>(on_disk: &[u8], slice: Slice) -> Result<&[T], HgError>
214 where
215 T: BytesCast,
216 {
217 // Either `usize::MAX` would result in "out of bounds" error since a single
218 // `&[u8]` cannot occupy the entire addess space.
219 let start = usize::try_from(slice.start.get()).unwrap_or(std::usize::MAX);
220 let len = usize::try_from(slice.len.get()).unwrap_or(std::usize::MAX);
221 on_disk
222 .get(start..)
223 .and_then(|bytes| T::slice_from_bytes(bytes, len).ok())
224 .map(|(slice, _rest)| slice)
225 .ok_or_else(|| {
226 HgError::corrupted("dirstate v2 slice is out of bounds")
227 })
228 }
229
230 pub(super) fn write(
231 dirstate_map: &mut DirstateMap,
232 parents: DirstateParents,
233 now: Timestamp,
234 ) -> Result<Vec<u8>, DirstateError> {
235 // TODO:Β how do we want to handle this in 2038?
236 let now: i32 = now.0.try_into().expect("time overflow");
237
238 let header_len = std::mem::size_of::<Header>();
239
240 // This ignores the space for paths, and for nodes without an entry.
241 // TODO: better estimate? Skip the `Vec` and write to a file directly?
242 let size_guess = header_len
243 + std::mem::size_of::<Node>()
244 * dirstate_map.nodes_with_entry_count as usize;
245 let mut out = Vec::with_capacity(size_guess);
246
247 // Keep space for the header. We’ll fill it out at the end when we know the
248 // actual offset for the root nodes.
249 out.resize(header_len, 0_u8);
250
251 let root = write_nodes(&mut dirstate_map.root, now, &mut out)?;
252
253 let header = Header {
254 marker: *V2_FORMAT_MARKER,
255 parents: parents,
256 root,
257 nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
258 nodes_with_copy_source_count: dirstate_map
259 .nodes_with_copy_source_count
260 .into(),
261 };
262 out[..header_len].copy_from_slice(header.as_bytes());
263 Ok(out)
264 }
265
266 /// Serialize the dirstate to the `v2` format after clearing ambigous `mtime`s.
267 fn write_nodes(
268 nodes: &mut dirstate_map::ChildNodes,
269 now: i32,
270 out: &mut Vec<u8>,
271 ) -> Result<ChildNodes, DirstateError> {
272 // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
273 // order. Sort to enable binary search in the written file.
274 let nodes = dirstate_map::Node::sorted(nodes);
275
276 // First accumulate serialized nodes in a `Vec`
277 let mut on_disk_nodes = Vec::with_capacity(nodes.len());
278 for (full_path, node) in nodes {
279 on_disk_nodes.push(Node {
280 children: write_nodes(&mut node.children, now, out)?,
281 tracked_descendants_count: node.tracked_descendants_count.into(),
282 full_path: write_slice::<u8>(
283 full_path.full_path().as_bytes(),
284 out,
285 ),
286 base_name_start: u32::try_from(full_path.base_name_start())
287 // Could only panic for paths over 4 GiB
288 .expect("dirstate-v2 offset overflow")
289 .into(),
290 copy_source: if let Some(source) = &node.copy_source {
291 write_slice::<u8>(source.as_bytes(), out)
292 } else {
293 Slice {
294 start: 0.into(),
295 len: 0.into(),
296 }
297 },
298 entry: if let Some(entry) = &mut node.entry {
299 clear_ambiguous_mtime(entry, now);
300 OptEntry {
301 state: entry.state.into(),
302 mode: entry.mode.into(),
303 mtime: entry.mtime.into(),
304 size: entry.size.into(),
305 }
306 } else {
307 OptEntry {
308 state: b'\0',
309 mode: 0.into(),
310 mtime: 0.into(),
311 size: 0.into(),
312 }
313 },
314 })
315 }
316 // … so we can write them contiguously
317 Ok(write_slice::<Node>(&on_disk_nodes, out))
318 }
319
320 fn write_slice<T>(slice: &[T], out: &mut Vec<u8>) -> Slice
321 where
322 T: BytesCast,
323 {
324 let start = u64::try_from(out.len())
325 // Could only panic on a 128-bit CPU with a dirstate over 16 EiB
326 .expect("dirstate-v2 offset overflow")
327 .into();
328 let len = u32::try_from(slice.len())
329 // Could only panic for paths over 4 GiB or nodes with over 4 billions
330 // child nodes
331 .expect("dirstate-v2 offset overflow")
332 .into();
333 out.extend(slice.as_bytes());
334 Slice { start, len }
335 }
@@ -24,18 +24,29 b' impl<T> WithBasename<T> {'
24 }
24 }
25 }
25 }
26
26
27 fn find_base_name_start(full_path: &HgPath) -> usize {
28 if let Some(last_slash_position) =
29 full_path.as_bytes().iter().rposition(|&byte| byte == b'/')
30 {
31 last_slash_position + 1
32 } else {
33 0
34 }
35 }
36
27 impl<T: AsRef<HgPath>> WithBasename<T> {
37 impl<T: AsRef<HgPath>> WithBasename<T> {
28 pub fn new(full_path: T) -> Self {
38 pub fn new(full_path: T) -> Self {
29 let base_name_start = if let Some(last_slash_position) = full_path
39 Self {
30 .as_ref()
40 base_name_start: find_base_name_start(full_path.as_ref()),
31 .as_bytes()
41 full_path,
32 .iter()
42 }
33 .rposition(|&byte| byte == b'/')
43 }
34 {
44
35 last_slash_position + 1
45 pub fn from_raw_parts(full_path: T, base_name_start: usize) -> Self {
36 } else {
46 debug_assert_eq!(
37 0
47 base_name_start,
38 };
48 find_base_name_start(full_path.as_ref())
49 );
39 Self {
50 Self {
40 base_name_start,
51 base_name_start,
41 full_path,
52 full_path,
@@ -47,6 +58,10 b' impl<T: AsRef<HgPath>> WithBasename<T> {'
47 &self.full_path.as_ref().as_bytes()[self.base_name_start..],
58 &self.full_path.as_ref().as_bytes()[self.base_name_start..],
48 )
59 )
49 }
60 }
61
62 pub fn base_name_start(&self) -> usize {
63 self.base_name_start
64 }
50 }
65 }
51
66
52 impl<T: AsRef<HgPath>> Borrow<HgPath> for WithBasename<T> {
67 impl<T: AsRef<HgPath>> Borrow<HgPath> for WithBasename<T> {
General Comments 0
You need to be logged in to leave comments. Login now