##// END OF EJS Templates
dirstate-v2: Skip readdir in status based on directory mtime...
Simon Sapin -
r48138:7138c863 default
parent child Browse files
Show More
@@ -23,6 +23,7 b' rayon = "1.3.0"'
23 regex = "1.3.9"
23 regex = "1.3.9"
24 twox-hash = "1.5.0"
24 twox-hash = "1.5.0"
25 same-file = "1.0.6"
25 same-file = "1.0.6"
26 tempfile = "3.1.0"
26 crossbeam-channel = "0.4"
27 crossbeam-channel = "0.4"
27 micro-timer = "0.3.0"
28 micro-timer = "0.3.0"
28 log = "0.4.8"
29 log = "0.4.8"
@@ -41,4 +42,3 b' default-features = false'
41 [dev-dependencies]
42 [dev-dependencies]
42 clap = "*"
43 clap = "*"
43 pretty_assertions = "0.6.1"
44 pretty_assertions = "0.6.1"
44 tempfile = "3.1.0"
@@ -317,6 +317,18 b" impl<'tree, 'on_disk> NodeRef<'tree, 'on"
317 }
317 }
318 }
318 }
319
319
320 pub(super) fn cached_directory_mtime(
321 &self,
322 ) -> Option<&on_disk::Timestamp> {
323 match self {
324 NodeRef::InMemory(_path, node) => match &node.data {
325 NodeData::CachedDirectory { mtime } => Some(mtime),
326 _ => None,
327 },
328 NodeRef::OnDisk(node) => node.cached_directory_mtime(),
329 }
330 }
331
320 pub(super) fn tracked_descendants_count(&self) -> u32 {
332 pub(super) fn tracked_descendants_count(&self) -> u32 {
321 match self {
333 match self {
322 NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
334 NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
@@ -479,7 +491,7 b" impl<'on_disk> DirstateMap<'on_disk> {"
479 }
491 }
480 }
492 }
481
493
482 fn get_or_insert_node<'tree, 'path>(
494 pub(super) fn get_or_insert_node<'tree, 'path>(
483 on_disk: &'on_disk [u8],
495 on_disk: &'on_disk [u8],
484 root: &'tree mut ChildNodes<'on_disk>,
496 root: &'tree mut ChildNodes<'on_disk>,
485 path: &'path HgPath,
497 path: &'path HgPath,
@@ -56,13 +56,31 b' pub(super) struct Node {'
56
56
57 /// Depending on the value of `state`:
57 /// Depending on the value of `state`:
58 ///
58 ///
59 /// * A null byte: `data` represents nothing
59 /// * A null byte: `data` is not used.
60 ///
60 /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
61 /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
61 /// represents a dirstate entry like in the v1 format.
62 /// represent a dirstate entry like in the v1 format.
63 ///
62 /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
64 /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
63 /// as the `Timestamp` for the mtime of a cached directory.
65 /// as the `Timestamp` for the mtime of a cached directory.
64 ///
66 ///
65 /// TODO: document directory caching
67 /// The presence of this state means that at some point, this path in
68 /// the working directory was observed:
69 ///
70 /// - To be a directory
71 /// - With the modification time as given by `Timestamp`
72 /// - That timestamp was already strictly in the past when observed,
73 /// meaning that later changes cannot happen in the same clock tick
74 /// and must cause a different modification time (unless the system
75 /// clock jumps back and we get unlucky, which is not impossible but
76 /// deemed unlikely enough).
77 /// - The directory did not contain any child entry that did not have a
78 /// corresponding dirstate node.
79 ///
80 /// This means that if `std::fs::symlink_metadata` later reports the
81 /// same modification time, we don’t need to call `std::fs::read_dir`
82 /// again for this directory and can iterate child dirstate nodes
83 /// instead.
66 state: u8,
84 state: u8,
67 data: Entry,
85 data: Entry,
68 }
86 }
@@ -76,7 +94,7 b' struct Entry {'
76 }
94 }
77
95
78 /// Duration since the Unix epoch
96 /// Duration since the Unix epoch
79 #[derive(BytesCast, Copy, Clone)]
97 #[derive(BytesCast, Copy, Clone, PartialEq)]
80 #[repr(C)]
98 #[repr(C)]
81 pub(super) struct Timestamp {
99 pub(super) struct Timestamp {
82 seconds: I64Be,
100 seconds: I64Be,
@@ -258,6 +276,14 b' impl Node {'
258 }
276 }
259 }
277 }
260
278
279 pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
280 if self.state == b'd' {
281 Some(self.data.as_timestamp())
282 } else {
283 None
284 }
285 }
286
261 pub(super) fn state(
287 pub(super) fn state(
262 &self,
288 &self,
263 ) -> Result<Option<EntryState>, DirstateV2ParseError> {
289 ) -> Result<Option<EntryState>, DirstateV2ParseError> {
@@ -326,8 +352,8 b' impl Entry {'
326 }
352 }
327 }
353 }
328
354
329 impl From<&'_ SystemTime> for Timestamp {
355 impl From<SystemTime> for Timestamp {
330 fn from(system_time: &'_ SystemTime) -> Self {
356 fn from(system_time: SystemTime) -> Self {
331 let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
357 let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
332 Ok(duration) => {
358 Ok(duration) => {
333 (duration.as_secs() as i64, duration.subsec_nanos())
359 (duration.as_secs() as i64, duration.subsec_nanos())
@@ -2,8 +2,11 b' use crate::dirstate::status::IgnoreFnTyp'
2 use crate::dirstate_tree::dirstate_map::BorrowedPath;
2 use crate::dirstate_tree::dirstate_map::BorrowedPath;
3 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
3 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
4 use crate::dirstate_tree::dirstate_map::DirstateMap;
4 use crate::dirstate_tree::dirstate_map::DirstateMap;
5 use crate::dirstate_tree::dirstate_map::NodeData;
5 use crate::dirstate_tree::dirstate_map::NodeRef;
6 use crate::dirstate_tree::dirstate_map::NodeRef;
6 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
7 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
8 use crate::dirstate_tree::on_disk::Timestamp;
9 use crate::dirstate_tree::path_with_basename::WithBasename;
7 use crate::matchers::get_ignore_function;
10 use crate::matchers::get_ignore_function;
8 use crate::matchers::Matcher;
11 use crate::matchers::Matcher;
9 use crate::utils::files::get_bytes_from_os_string;
12 use crate::utils::files::get_bytes_from_os_string;
@@ -18,10 +21,12 b' use crate::StatusError;'
18 use crate::StatusOptions;
21 use crate::StatusOptions;
19 use micro_timer::timed;
22 use micro_timer::timed;
20 use rayon::prelude::*;
23 use rayon::prelude::*;
24 use std::borrow::Cow;
21 use std::io;
25 use std::io;
22 use std::path::Path;
26 use std::path::Path;
23 use std::path::PathBuf;
27 use std::path::PathBuf;
24 use std::sync::Mutex;
28 use std::sync::Mutex;
29 use std::time::SystemTime;
25
30
26 /// Returns the status of the working directory compared to its parent
31 /// Returns the status of the working directory compared to its parent
27 /// changeset.
32 /// changeset.
@@ -52,19 +57,45 b" pub fn status<'tree, 'on_disk: 'tree>("
52 options,
57 options,
53 matcher,
58 matcher,
54 ignore_fn,
59 ignore_fn,
55 outcome: Mutex::new(DirstateStatus::default()),
60 outcome: Default::default(),
61 cached_directory_mtimes_to_add: Default::default(),
62 filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
56 };
63 };
57 let is_at_repo_root = true;
64 let is_at_repo_root = true;
58 let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
65 let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
59 let has_ignored_ancestor = false;
66 let has_ignored_ancestor = false;
67 let root_cached_mtime = None;
68 let root_dir_metadata = None;
69 // If the path we have for the repository root is a symlink, do follow it.
70 // (As opposed to symlinks within the working directory which are not
71 // followed, using `std::fs::symlink_metadata`.)
60 common.traverse_fs_directory_and_dirstate(
72 common.traverse_fs_directory_and_dirstate(
61 has_ignored_ancestor,
73 has_ignored_ancestor,
62 dmap.root.as_ref(),
74 dmap.root.as_ref(),
63 hg_path,
75 hg_path,
64 &root_dir,
76 &root_dir,
77 root_dir_metadata,
78 root_cached_mtime,
65 is_at_repo_root,
79 is_at_repo_root,
66 )?;
80 )?;
67 Ok((common.outcome.into_inner().unwrap(), warnings))
81 let outcome = common.outcome.into_inner().unwrap();
82 let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
83 for (path, mtime) in &to_add {
84 let node = DirstateMap::get_or_insert_node(
85 dmap.on_disk,
86 &mut dmap.root,
87 path,
88 WithBasename::to_cow_owned,
89 |_| {},
90 )?;
91 match &node.data {
92 NodeData::Entry(_) => {} // Don’t overwrite an entry
93 NodeData::CachedDirectory { .. } | NodeData::None => {
94 node.data = NodeData::CachedDirectory { mtime: *mtime }
95 }
96 }
97 }
98 Ok((outcome, warnings))
68 }
99 }
69
100
70 /// Bag of random things needed by various parts of the algorithm. Reduces the
101 /// Bag of random things needed by various parts of the algorithm. Reduces the
@@ -75,6 +106,12 b" struct StatusCommon<'a, 'tree, 'on_disk:"
75 matcher: &'a (dyn Matcher + Sync),
106 matcher: &'a (dyn Matcher + Sync),
76 ignore_fn: IgnoreFnType<'a>,
107 ignore_fn: IgnoreFnType<'a>,
77 outcome: Mutex<DirstateStatus<'on_disk>>,
108 outcome: Mutex<DirstateStatus<'on_disk>>,
109 cached_directory_mtimes_to_add:
110 Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
111
112 /// The current time at the start of the `status()` algorithm, as measured
113 /// and possibly truncated by the filesystem.
114 filesystem_time_at_status_start: Option<SystemTime>,
78 }
115 }
79
116
80 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
117 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
@@ -97,18 +134,54 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
97 .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
134 .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
98 }
135 }
99
136
137 /// If this returns true, we can get accurate results by only using
138 /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
139 /// need to call `read_dir`.
140 fn can_skip_fs_readdir(
141 &self,
142 directory_metadata: Option<&std::fs::Metadata>,
143 cached_directory_mtime: Option<&Timestamp>,
144 ) -> bool {
145 if !self.options.list_unknown && !self.options.list_ignored {
146 // All states that we care about listing have corresponding
147 // dirstate entries.
148 // This happens for example with `hg status -mard`.
149 return true;
150 }
151 if let Some(cached_mtime) = cached_directory_mtime {
152 // The dirstate contains a cached mtime for this directory, set by
153 // a previous run of the `status` algorithm which found this
154 // directory eligible for `read_dir` caching.
155 if let Some(meta) = directory_metadata {
156 if let Ok(current_mtime) = meta.modified() {
157 if current_mtime == cached_mtime.into() {
158 // The mtime of that directory has not changed since
159 // then, which means that the
160 // results of `read_dir` should also
161 // be unchanged.
162 return true;
163 }
164 }
165 }
166 }
167 false
168 }
169
170 /// Returns whether the filesystem directory was found to have any entry
171 /// that does not have a corresponding dirstate tree node.
100 fn traverse_fs_directory_and_dirstate(
172 fn traverse_fs_directory_and_dirstate(
101 &self,
173 &self,
102 has_ignored_ancestor: bool,
174 has_ignored_ancestor: bool,
103 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
175 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
104 directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
176 directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
105 directory_fs_path: &Path,
177 directory_fs_path: &Path,
178 directory_metadata: Option<&std::fs::Metadata>,
179 cached_directory_mtime: Option<&Timestamp>,
106 is_at_repo_root: bool,
180 is_at_repo_root: bool,
107 ) -> Result<(), DirstateV2ParseError> {
181 ) -> Result<bool, DirstateV2ParseError> {
108 if !self.options.list_unknown && !self.options.list_ignored {
182 if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
109 // We only care about files in the dirstate, so we can skip listing
183 {
110 // filesystem directories entirely.
184 dirstate_nodes
111 return dirstate_nodes
112 .par_iter()
185 .par_iter()
113 .map(|dirstate_node| {
186 .map(|dirstate_node| {
114 let fs_path = directory_fs_path.join(get_path_from_bytes(
187 let fs_path = directory_fs_path.join(get_path_from_bytes(
@@ -131,7 +204,13 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
131 }
204 }
132 }
205 }
133 })
206 })
134 .collect();
207 .collect::<Result<_, _>>()?;
208
209 // Conservatively don’t let the caller assume that there aren’t
210 // any, since we don’t know.
211 let directory_has_any_fs_only_entry = true;
212
213 return Ok(directory_has_any_fs_only_entry);
135 }
214 }
136
215
137 let mut fs_entries = if let Ok(entries) = self.read_dir(
216 let mut fs_entries = if let Ok(entries) = self.read_dir(
@@ -174,6 +253,7 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
174 .par_bridge()
253 .par_bridge()
175 .map(|pair| {
254 .map(|pair| {
176 use itertools::EitherOrBoth::*;
255 use itertools::EitherOrBoth::*;
256 let is_fs_only = pair.is_right();
177 match pair {
257 match pair {
178 Both(dirstate_node, fs_entry) => self
258 Both(dirstate_node, fs_entry) => self
179 .traverse_fs_and_dirstate(
259 .traverse_fs_and_dirstate(
@@ -181,18 +261,19 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
181 &fs_entry.metadata,
261 &fs_entry.metadata,
182 dirstate_node,
262 dirstate_node,
183 has_ignored_ancestor,
263 has_ignored_ancestor,
184 ),
264 )?,
185 Left(dirstate_node) => {
265 Left(dirstate_node) => {
186 self.traverse_dirstate_only(dirstate_node)
266 self.traverse_dirstate_only(dirstate_node)?
187 }
267 }
188 Right(fs_entry) => Ok(self.traverse_fs_only(
268 Right(fs_entry) => self.traverse_fs_only(
189 has_ignored_ancestor,
269 has_ignored_ancestor,
190 directory_hg_path,
270 directory_hg_path,
191 fs_entry,
271 fs_entry,
192 )),
272 ),
193 }
273 }
274 Ok(is_fs_only)
194 })
275 })
195 .collect()
276 .try_reduce(|| false, |a, b| Ok(a || b))
196 }
277 }
197
278
198 fn traverse_fs_and_dirstate(
279 fn traverse_fs_and_dirstate(
@@ -224,12 +305,20 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
224 }
305 }
225 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
306 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
226 let is_at_repo_root = false;
307 let is_at_repo_root = false;
227 self.traverse_fs_directory_and_dirstate(
308 let directory_has_any_fs_only_entry = self
228 is_ignored,
309 .traverse_fs_directory_and_dirstate(
229 dirstate_node.children(self.dmap.on_disk)?,
310 is_ignored,
230 hg_path,
311 dirstate_node.children(self.dmap.on_disk)?,
231 fs_path,
312 hg_path,
232 is_at_repo_root,
313 fs_path,
314 Some(fs_metadata),
315 dirstate_node.cached_directory_mtime(),
316 is_at_repo_root,
317 )?;
318 self.maybe_save_directory_mtime(
319 directory_has_any_fs_only_entry,
320 fs_metadata,
321 dirstate_node,
233 )?
322 )?
234 } else {
323 } else {
235 if file_or_symlink && self.matcher.matches(hg_path) {
324 if file_or_symlink && self.matcher.matches(hg_path) {
@@ -274,6 +363,75 b" impl<'a, 'tree, 'on_disk> StatusCommon<'"
274 Ok(())
363 Ok(())
275 }
364 }
276
365
366 fn maybe_save_directory_mtime(
367 &self,
368 directory_has_any_fs_only_entry: bool,
369 directory_metadata: &std::fs::Metadata,
370 dirstate_node: NodeRef<'tree, 'on_disk>,
371 ) -> Result<(), DirstateV2ParseError> {
372 if !directory_has_any_fs_only_entry {
373 // All filesystem directory entries from `read_dir` have a
374 // corresponding node in the dirstate, so we can reconstitute the
375 // names of those entries without calling `read_dir` again.
376 if let (Some(status_start), Ok(directory_mtime)) = (
377 &self.filesystem_time_at_status_start,
378 directory_metadata.modified(),
379 ) {
380 // Although the Rust standard library’s `SystemTime` type
381 // has nanosecond precision, the times reported for a
382 // directory’s (or file’s) modified time may have lower
383 // resolution based on the filesystem (for example ext3
384 // only stores integer seconds), kernel (see
385 // https://stackoverflow.com/a/14393315/1162888), etc.
386 if &directory_mtime >= status_start {
387 // The directory was modified too recently, don’t cache its
388 // `read_dir` results.
389 //
390 // A timeline like this is possible:
391 //
392 // 1. A change to this directory (direct child was
393 // added or removed) cause its mtime to be set
394 // (possibly truncated) to `directory_mtime`
395 // 2. This `status` algorithm calls `read_dir`
396 // 3. An other change is made to the same directory is
397 // made so that calling `read_dir` agin would give
398 // different results, but soon enough after 1. that
399 // the mtime stays the same
400 //
401 // On a system where the time resolution poor, this
402 // scenario is not unlikely if all three steps are caused
403 // by the same script.
404 } else {
405 // We’ve observed (through `status_start`) that time has
406 // “progressed” since `directory_mtime`, so any further
407 // change to this directory is extremely likely to cause a
408 // different mtime.
409 //
410 // Having the same mtime again is not entirely impossible
411 // since the system clock is not monotonic. It could jump
412 // backward to some point before `directory_mtime`, then a
413 // directory change could potentially happen during exactly
414 // the wrong tick.
415 //
416 // We deem this scenario (unlike the previous one) to be
417 // unlikely enough in practice.
418 let timestamp = directory_mtime.into();
419 let cached = dirstate_node.cached_directory_mtime();
420 if cached != Some(&timestamp) {
421 let hg_path = dirstate_node
422 .full_path_borrowed(self.dmap.on_disk)?
423 .detach_from_tree();
424 self.cached_directory_mtimes_to_add
425 .lock()
426 .unwrap()
427 .push((hg_path, timestamp))
428 }
429 }
430 }
431 }
432 Ok(())
433 }
434
277 /// A file with `EntryState::Normal` in the dirstate was found in the
435 /// A file with `EntryState::Normal` in the dirstate was found in the
278 /// filesystem
436 /// filesystem
279 fn handle_normal_file(
437 fn handle_normal_file(
@@ -505,3 +663,22 b' impl DirEntry {'
505 Ok(results)
663 Ok(results)
506 }
664 }
507 }
665 }
666
667 /// Return the `mtime` of a temporary file newly-created in the `.hg` directory
668 /// of the given repository.
669 ///
670 /// This is similar to `SystemTime::now()`, with the result truncated to the
671 /// same time resolution as other files’ modification times. Using `.hg`
672 /// instead of the system’s default temporary directory (such as `/tmp`) makes
673 /// it more likely the temporary file is in the same disk partition as contents
674 /// of the working directory, which can matter since different filesystems may
675 /// store timestamps with different resolutions.
676 ///
677 /// This may fail, typically if we lack write permissions. In that case we
678 /// should continue the `status()` algorithm anyway and consider the current
679 /// date/time to be unknown.
680 fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
681 tempfile::tempfile_in(repo_root.join(".hg"))?
682 .metadata()?
683 .modified()
684 }
General Comments 0
You need to be logged in to leave comments. Login now