##// END OF EJS Templates
dirstate-v2: Skip readdir in status based on directory mtime...
Simon Sapin -
r48138:7138c863 default
parent child Browse files
Show More
@@ -23,6 +23,7 b' rayon = "1.3.0"'
23 23 regex = "1.3.9"
24 24 twox-hash = "1.5.0"
25 25 same-file = "1.0.6"
26 tempfile = "3.1.0"
26 27 crossbeam-channel = "0.4"
27 28 micro-timer = "0.3.0"
28 29 log = "0.4.8"
@@ -41,4 +42,3 b' default-features = false'
41 42 [dev-dependencies]
42 43 clap = "*"
43 44 pretty_assertions = "0.6.1"
44 tempfile = "3.1.0"
@@ -317,6 +317,18 b" impl<'tree, 'on_disk> NodeRef<'tree, 'on"
317 317 }
318 318 }
319 319
320 pub(super) fn cached_directory_mtime(
321 &self,
322 ) -> Option<&on_disk::Timestamp> {
323 match self {
324 NodeRef::InMemory(_path, node) => match &node.data {
325 NodeData::CachedDirectory { mtime } => Some(mtime),
326 _ => None,
327 },
328 NodeRef::OnDisk(node) => node.cached_directory_mtime(),
329 }
330 }
331
320 332 pub(super) fn tracked_descendants_count(&self) -> u32 {
321 333 match self {
322 334 NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
@@ -479,7 +491,7 b" impl<'on_disk> DirstateMap<'on_disk> {"
479 491 }
480 492 }
481 493
482 fn get_or_insert_node<'tree, 'path>(
494 pub(super) fn get_or_insert_node<'tree, 'path>(
483 495 on_disk: &'on_disk [u8],
484 496 root: &'tree mut ChildNodes<'on_disk>,
485 497 path: &'path HgPath,
@@ -56,13 +56,31 b' pub(super) struct Node {'
56 56
57 57 /// Depending on the value of `state`:
58 58 ///
59 /// * A null byte: `data` represents nothing
59 /// * A null byte: `data` is not used.
60 ///
60 61 /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
61 /// represents a dirstate entry like in the v1 format.
62 /// represent a dirstate entry like in the v1 format.
63 ///
62 64 /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
63 65 /// as the `Timestamp` for the mtime of a cached directory.
64 66 ///
65 /// TODO: document directory caching
67 /// The presence of this state means that at some point, this path in
68 /// the working directory was observed:
69 ///
70 /// - To be a directory
71 /// - With the modification time as given by `Timestamp`
72 /// - That timestamp was already strictly in the past when observed,
73 /// meaning that later changes cannot happen in the same clock tick
74 /// and must cause a different modification time (unless the system
75 /// clock jumps back and we get unlucky, which is not impossible but
76 /// deemed unlikely enough).
77 /// - The directory did not contain any child entry that did not have a
78 /// corresponding dirstate node.
79 ///
80 /// This means that if `std::fs::symlink_metadata` later reports the
81 /// same modification time, we don’t need to call `std::fs::read_dir`
82 /// again for this directory and can iterate child dirstate nodes
83 /// instead.
66 84 state: u8,
67 85 data: Entry,
68 86 }
@@ -76,7 +94,7 b' struct Entry {'
76 94 }
77 95
78 96 /// Duration since the Unix epoch
79 #[derive(BytesCast, Copy, Clone)]
97 #[derive(BytesCast, Copy, Clone, PartialEq)]
80 98 #[repr(C)]
81 99 pub(super) struct Timestamp {
82 100 seconds: I64Be,
@@ -258,6 +276,14 b' impl Node {'
258 276 }
259 277 }
260 278
279 pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
280 if self.state == b'd' {
281 Some(self.data.as_timestamp())
282 } else {
283 None
284 }
285 }
286
261 287 pub(super) fn state(
262 288 &self,
263 289 ) -> Result<Option<EntryState>, DirstateV2ParseError> {
@@ -326,8 +352,8 b' impl Entry {'
326 352 }
327 353 }
328 354
329 impl From<&'_ SystemTime> for Timestamp {
330 fn from(system_time: &'_ SystemTime) -> Self {
355 impl From<SystemTime> for Timestamp {
356 fn from(system_time: SystemTime) -> Self {
331 357 let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
332 358 Ok(duration) => {
333 359 (duration.as_secs() as i64, duration.subsec_nanos())
@@ -2,8 +2,11 b' use crate::dirstate::status::IgnoreFnTyp'
2 2 use crate::dirstate_tree::dirstate_map::BorrowedPath;
3 3 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
4 4 use crate::dirstate_tree::dirstate_map::DirstateMap;
5 use crate::dirstate_tree::dirstate_map::NodeData;
5 6 use crate::dirstate_tree::dirstate_map::NodeRef;
6 7 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
8 use crate::dirstate_tree::on_disk::Timestamp;
9 use crate::dirstate_tree::path_with_basename::WithBasename;
7 10 use crate::matchers::get_ignore_function;
8 11 use crate::matchers::Matcher;
9 12 use crate::utils::files::get_bytes_from_os_string;
@@ -18,10 +21,12 b' use crate::StatusError;'
18 21 use crate::StatusOptions;
19 22 use micro_timer::timed;
20 23 use rayon::prelude::*;
24 use std::borrow::Cow;
21 25 use std::io;
22 26 use std::path::Path;
23 27 use std::path::PathBuf;
24 28 use std::sync::Mutex;
29 use std::time::SystemTime;
25 30
26 31 /// Returns the status of the working directory compared to its parent
27 32 /// changeset.
@@ -52,19 +57,45 b" pub fn status<'tree, 'on_disk: 'tree>("
52 57 options,
53 58 matcher,
54 59 ignore_fn,
55 outcome: Mutex::new(DirstateStatus::default()),
60 outcome: Default::default(),
61 cached_directory_mtimes_to_add: Default::default(),
62 filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
56 63 };
57 64 let is_at_repo_root = true;
58 65 let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
59 66 let has_ignored_ancestor = false;
67 let root_cached_mtime = None;
68 let root_dir_metadata = None;
69 // If the path we have for the repository root is a symlink, do follow it.
70 // (As opposed to symlinks within the working directory which are not
71 // followed, using `std::fs::symlink_metadata`.)
60 72 common.traverse_fs_directory_and_dirstate(
61 73 has_ignored_ancestor,
62 74 dmap.root.as_ref(),
63 75 hg_path,
64 76 &root_dir,
77 root_dir_metadata,
78 root_cached_mtime,
65 79 is_at_repo_root,
66 80 )?;
67 Ok((common.outcome.into_inner().unwrap(), warnings))
81 let outcome = common.outcome.into_inner().unwrap();
82 let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
83 for (path, mtime) in &to_add {
84 let node = DirstateMap::get_or_insert_node(
85 dmap.on_disk,
86 &mut dmap.root,
87 path,
88 WithBasename::to_cow_owned,
89 |_| {},
90 )?;
91 match &node.data {
92 NodeData::Entry(_) => {} // Don’t overwrite an entry
93 NodeData::CachedDirectory { .. } | NodeData::None => {
94 node.data = NodeData::CachedDirectory { mtime: *mtime }
95 }
96 }
97 }
98 Ok((outcome, warnings))
68 99 }
69 100
70 101 /// Bag of random things needed by various parts of the algorithm. Reduces the
@@ -75,6 +106,12 b" struct StatusCommon<'a, 'tree, 'on_disk:"
75 106 matcher: &'a (dyn Matcher + Sync),
76 107 ignore_fn: IgnoreFnType<'a>,
77 108 outcome: Mutex<DirstateStatus<'on_disk>>,
109 cached_directory_mtimes_to_add:
110 Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
111
112 /// The current time at the start of the `status()` algorithm, as measured
113 /// and possibly truncated by the filesystem.
114 filesystem_time_at_status_start: Option<SystemTime>,
78 115 }
79 116
80 117 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
@@ -97,18 +134,54 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
97 134 .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
98 135 }
99 136
137 /// If this returns true, we can get accurate results by only using
138 /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
139 /// need to call `read_dir`.
140 fn can_skip_fs_readdir(
141 &self,
142 directory_metadata: Option<&std::fs::Metadata>,
143 cached_directory_mtime: Option<&Timestamp>,
144 ) -> bool {
145 if !self.options.list_unknown && !self.options.list_ignored {
146 // All states that we care about listing have corresponding
147 // dirstate entries.
148 // This happens for example with `hg status -mard`.
149 return true;
150 }
151 if let Some(cached_mtime) = cached_directory_mtime {
152 // The dirstate contains a cached mtime for this directory, set by
153 // a previous run of the `status` algorithm which found this
154 // directory eligible for `read_dir` caching.
155 if let Some(meta) = directory_metadata {
156 if let Ok(current_mtime) = meta.modified() {
157 if current_mtime == cached_mtime.into() {
158 // The mtime of that directory has not changed since
159 // then, which means that the
160 // results of `read_dir` should also
161 // be unchanged.
162 return true;
163 }
164 }
165 }
166 }
167 false
168 }
169
170 /// Returns whether the filesystem directory was found to have any entry
171 /// that does not have a corresponding dirstate tree node.
100 172 fn traverse_fs_directory_and_dirstate(
101 173 &self,
102 174 has_ignored_ancestor: bool,
103 175 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
104 176 directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
105 177 directory_fs_path: &Path,
178 directory_metadata: Option<&std::fs::Metadata>,
179 cached_directory_mtime: Option<&Timestamp>,
106 180 is_at_repo_root: bool,
107 ) -> Result<(), DirstateV2ParseError> {
108 if !self.options.list_unknown && !self.options.list_ignored {
109 // We only care about files in the dirstate, so we can skip listing
110 // filesystem directories entirely.
111 return dirstate_nodes
181 ) -> Result<bool, DirstateV2ParseError> {
182 if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
183 {
184 dirstate_nodes
112 185 .par_iter()
113 186 .map(|dirstate_node| {
114 187 let fs_path = directory_fs_path.join(get_path_from_bytes(
@@ -131,7 +204,13 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
131 204 }
132 205 }
133 206 })
134 .collect();
207 .collect::<Result<_, _>>()?;
208
209 // Conservatively don’t let the caller assume that there aren’t
210 // any, since we don’t know.
211 let directory_has_any_fs_only_entry = true;
212
213 return Ok(directory_has_any_fs_only_entry);
135 214 }
136 215
137 216 let mut fs_entries = if let Ok(entries) = self.read_dir(
@@ -174,6 +253,7 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
174 253 .par_bridge()
175 254 .map(|pair| {
176 255 use itertools::EitherOrBoth::*;
256 let is_fs_only = pair.is_right();
177 257 match pair {
178 258 Both(dirstate_node, fs_entry) => self
179 259 .traverse_fs_and_dirstate(
@@ -181,18 +261,19 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
181 261 &fs_entry.metadata,
182 262 dirstate_node,
183 263 has_ignored_ancestor,
184 ),
264 )?,
185 265 Left(dirstate_node) => {
186 self.traverse_dirstate_only(dirstate_node)
266 self.traverse_dirstate_only(dirstate_node)?
187 267 }
188 Right(fs_entry) => Ok(self.traverse_fs_only(
268 Right(fs_entry) => self.traverse_fs_only(
189 269 has_ignored_ancestor,
190 270 directory_hg_path,
191 271 fs_entry,
192 )),
272 ),
193 273 }
274 Ok(is_fs_only)
194 275 })
195 .collect()
276 .try_reduce(|| false, |a, b| Ok(a || b))
196 277 }
197 278
198 279 fn traverse_fs_and_dirstate(
@@ -224,12 +305,20 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
224 305 }
225 306 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
226 307 let is_at_repo_root = false;
227 self.traverse_fs_directory_and_dirstate(
308 let directory_has_any_fs_only_entry = self
309 .traverse_fs_directory_and_dirstate(
228 310 is_ignored,
229 311 dirstate_node.children(self.dmap.on_disk)?,
230 312 hg_path,
231 313 fs_path,
314 Some(fs_metadata),
315 dirstate_node.cached_directory_mtime(),
232 316 is_at_repo_root,
317 )?;
318 self.maybe_save_directory_mtime(
319 directory_has_any_fs_only_entry,
320 fs_metadata,
321 dirstate_node,
233 322 )?
234 323 } else {
235 324 if file_or_symlink && self.matcher.matches(hg_path) {
@@ -274,6 +363,75 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
274 363 Ok(())
275 364 }
276 365
366 fn maybe_save_directory_mtime(
367 &self,
368 directory_has_any_fs_only_entry: bool,
369 directory_metadata: &std::fs::Metadata,
370 dirstate_node: NodeRef<'tree, 'on_disk>,
371 ) -> Result<(), DirstateV2ParseError> {
372 if !directory_has_any_fs_only_entry {
373 // All filesystem directory entries from `read_dir` have a
374 // corresponding node in the dirstate, so we can reconstitute the
375 // names of those entries without calling `read_dir` again.
376 if let (Some(status_start), Ok(directory_mtime)) = (
377 &self.filesystem_time_at_status_start,
378 directory_metadata.modified(),
379 ) {
380 // Although the Rust standard library’s `SystemTime` type
381 // has nanosecond precision, the times reported for a
382 // directory’s (or file’s) modified time may have lower
383 // resolution based on the filesystem (for example ext3
384 // only stores integer seconds), kernel (see
385 // https://stackoverflow.com/a/14393315/1162888), etc.
386 if &directory_mtime >= status_start {
387 // The directory was modified too recently, don’t cache its
388 // `read_dir` results.
389 //
390 // A timeline like this is possible:
391 //
392 // 1. A change to this directory (direct child was
393 // added or removed) causes its mtime to be set
394 // (possibly truncated) to `directory_mtime`
395 // 2. This `status` algorithm calls `read_dir`
396 // 3. Another change is made to the same directory
397 // so that calling `read_dir` again would give
398 // different results, but soon enough after 1. that
399 // the mtime stays the same
400 //
401 // On a system where the time resolution is poor, this
402 // scenario is not unlikely if all three steps are caused
403 // by the same script.
404 } else {
405 // We’ve observed (through `status_start`) that time has
406 // “progressed” since `directory_mtime`, so any further
407 // change to this directory is extremely likely to cause a
408 // different mtime.
409 //
410 // Having the same mtime again is not entirely impossible
411 // since the system clock is not monotonic. It could jump
412 // backward to some point before `directory_mtime`, then a
413 // directory change could potentially happen during exactly
414 // the wrong tick.
415 //
416 // We deem this scenario (unlike the previous one) to be
417 // unlikely enough in practice.
418 let timestamp = directory_mtime.into();
419 let cached = dirstate_node.cached_directory_mtime();
420 if cached != Some(&timestamp) {
421 let hg_path = dirstate_node
422 .full_path_borrowed(self.dmap.on_disk)?
423 .detach_from_tree();
424 self.cached_directory_mtimes_to_add
425 .lock()
426 .unwrap()
427 .push((hg_path, timestamp))
428 }
429 }
430 }
431 }
432 Ok(())
433 }
434
277 435 /// A file with `EntryState::Normal` in the dirstate was found in the
278 436 /// filesystem
279 437 fn handle_normal_file(
@@ -505,3 +663,22 b' impl DirEntry {'
505 663 Ok(results)
506 664 }
507 665 }
666
667 /// Return the `mtime` of a temporary file newly-created in the `.hg` directory
668 /// of the given repository.
669 ///
670 /// This is similar to `SystemTime::now()`, with the result truncated to the
671 /// same time resolution as other files’ modification times. Using `.hg`
672 /// instead of the system’s default temporary directory (such as `/tmp`) makes
673 /// it more likely the temporary file is in the same disk partition as contents
674 /// of the working directory, which can matter since different filesystems may
675 /// store timestamps with different resolutions.
676 ///
677 /// This may fail, typically if we lack write permissions. In that case we
678 /// should continue the `status()` algorithm anyway and consider the current
679 /// date/time to be unknown.
680 fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
681 tempfile::tempfile_in(repo_root.join(".hg"))?
682 .metadata()?
683 .modified()
684 }
General Comments 0
You need to be logged in to leave comments. Login now