diff --git a/rust/Cargo.lock b/rust/Cargo.lock --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -146,6 +146,7 @@ dependencies = [ "rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -393,6 +394,11 @@ dependencies = [ ] [[package]] +name = "redox_syscall" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] name = "regex" version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -409,6 +415,14 @@ version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] +name = "remove_dir_all" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "rustc_version" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -435,6 +449,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] +name = "tempfile" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)", + "remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "thread_local" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -519,12 +546,15 @@ source = "registry+https://github.com/ru "checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098" "checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9" "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" "checksum regex 1.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b5508c1941e4e7cb19965abef075d35a9a8b5cdf0846f30b4050e9b55dc55e87" "checksum regex-syntax 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "e734e891f5b408a29efbf8309e656876276f49ab6a6ac208600b4419bd893d90" +"checksum remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e" "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" "checksum thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" "checksum twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56" "checksum wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)" = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml --- a/rust/hg-core/Cargo.toml +++ b/rust/hg-core/Cargo.toml @@ -18,3 +18,6 @@ rand_pcg = "0.1.1" rayon = "1.3.0" regex = "1.1.0" twox-hash = "1.5.0" + +[dev-dependencies] +tempfile = "3.1.0" \ No newline at end of file diff --git a/rust/hg-core/src/utils.rs b/rust/hg-core/src/utils.rs --- a/rust/hg-core/src/utils.rs +++ b/rust/hg-core/src/utils.rs @@ -9,6 +9,7 @@ pub mod files; pub mod hg_path; +pub mod path_auditor; /// Useful until rust/issues/56345 is stable /// diff --git a/rust/hg-core/src/utils/files.rs b/rust/hg-core/src/utils/files.rs --- a/rust/hg-core/src/utils/files.rs +++ b/rust/hg-core/src/utils/files.rs @@ -12,6 +12,8 @@ use crate::utils::hg_path::{HgPath, HgPathBuf}; use std::iter::FusedIterator; +use crate::utils::replace_slice; +use lazy_static::lazy_static; use std::fs::Metadata; use std::path::Path; @@ -85,6 +87,41 @@ pub fn normalize_case(path: &HgPath) -> path.to_ascii_lowercase() } +lazy_static! { + static ref IGNORED_CHARS: Vec> = { + [ + 0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, + 0x202e, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff, + ] + .iter() + .map(|code| { + std::char::from_u32(*code) + .unwrap() + .encode_utf8(&mut [0; 3]) + .bytes() + .collect() + }) + .collect() + }; +} + +fn hfs_ignore_clean(bytes: &[u8]) -> Vec { + let mut buf = bytes.to_owned(); + let needs_escaping = bytes.iter().any(|b| *b == b'\xe2' || *b == b'\xef'); + if needs_escaping { + for forbidden in IGNORED_CHARS.iter() { + replace_slice(&mut buf, forbidden, &[]) + } + buf + } else { + buf + } +} + +pub fn lower_clean(bytes: &[u8]) -> Vec { + hfs_ignore_clean(&bytes.to_ascii_lowercase()) +} + #[derive(Eq, PartialEq, Ord, PartialOrd, Copy, Clone)] pub struct HgMetadata { pub st_dev: u64, diff --git a/rust/hg-core/src/utils/hg_path.rs b/rust/hg-core/src/utils/hg_path.rs --- a/rust/hg-core/src/utils/hg_path.rs +++ b/rust/hg-core/src/utils/hg_path.rs @@ -15,12 +15,35 @@ use std::path::{Path, PathBuf}; pub enum HgPathError { /// Bytes from the invalid `HgPath` LeadingSlash(Vec), - /// Bytes and index of the second slash - ConsecutiveSlashes(Vec, usize), - /// Bytes and index of the null byte - ContainsNullByte(Vec, usize), + ConsecutiveSlashes { + bytes: Vec, + second_slash_index: usize, + }, + ContainsNullByte { + bytes: Vec, + null_byte_index: usize, + }, /// Bytes DecodeError(Vec), + /// The rest come from audit errors + EndsWithSlash(HgPathBuf), + ContainsIllegalComponent(HgPathBuf), + /// Path is inside the `.hg` folder + InsideDotHg(HgPathBuf), + IsInsideNestedRepo { + path: HgPathBuf, + nested_repo: HgPathBuf, + }, + TraversesSymbolicLink { + path: HgPathBuf, + symlink: HgPathBuf, + }, + NotFsCompliant(HgPathBuf), + /// `path` is the smallest invalid path + NotUnderRoot { + path: PathBuf, + root: PathBuf, + }, } impl ToString for HgPathError { @@ -29,17 +52,55 @@ impl ToString for HgPathError { HgPathError::LeadingSlash(bytes) => { format!("Invalid HgPath '{:?}': has a leading slash.", bytes) } - HgPathError::ConsecutiveSlashes(bytes, pos) => format!( - "Invalid HgPath '{:?}': consecutive slahes at pos {}.", + HgPathError::ConsecutiveSlashes { + bytes, + second_slash_index: pos, + } => format!( + "Invalid HgPath '{:?}': consecutive slashes at pos {}.", bytes, pos ), - HgPathError::ContainsNullByte(bytes, pos) => format!( + HgPathError::ContainsNullByte { + bytes, + null_byte_index: pos, + } => format!( "Invalid HgPath '{:?}': contains null byte at pos {}.", bytes, pos ), HgPathError::DecodeError(bytes) => { format!("Invalid HgPath '{:?}': could not be decoded.", bytes) } + HgPathError::EndsWithSlash(path) => { + format!("Audit failed for '{}': ends with a slash.", path) + } + HgPathError::ContainsIllegalComponent(path) => format!( + "Audit failed for '{}': contains an illegal component.", + path + ), + HgPathError::InsideDotHg(path) => format!( + "Audit failed for '{}': is inside the '.hg' folder.", + path + ), + HgPathError::IsInsideNestedRepo { + path, + nested_repo: nested, + } => format!( + "Audit failed for '{}': is inside a nested repository '{}'.", + path, nested + ), + HgPathError::TraversesSymbolicLink { path, symlink } => format!( + "Audit failed for '{}': traverses symbolic link '{}'.", + path, symlink + ), + HgPathError::NotFsCompliant(path) => format!( + "Audit failed for '{}': cannot be turned into a \ + filesystem path.", + path + ), + HgPathError::NotUnderRoot { path, root } => format!( + "Audit failed for '{}': not under root {}.", + path.display(), + root.display() + ), } } } @@ -229,17 +290,17 @@ impl HgPath { for (index, byte) in bytes.iter().enumerate() { match byte { 0 => { - return Err(HgPathError::ContainsNullByte( - bytes.to_vec(), - index, - )) + return Err(HgPathError::ContainsNullByte { + bytes: bytes.to_vec(), + null_byte_index: index, + }) } b'/' => { if previous_byte.is_some() && previous_byte == Some(b'/') { - return Err(HgPathError::ConsecutiveSlashes( - bytes.to_vec(), - index, - )); + return Err(HgPathError::ConsecutiveSlashes { + bytes: bytes.to_vec(), + second_slash_index: index, + }); } } _ => (), @@ -431,11 +492,17 @@ mod tests { HgPath::new(b"/").check_state() ); assert_eq!( - Err(HgPathError::ConsecutiveSlashes(b"a/b//c".to_vec(), 4)), + Err(HgPathError::ConsecutiveSlashes { + bytes: b"a/b//c".to_vec(), + second_slash_index: 4 + }), HgPath::new(b"a/b//c").check_state() ); assert_eq!( - Err(HgPathError::ContainsNullByte(b"a/b/\0c".to_vec(), 4)), + Err(HgPathError::ContainsNullByte { + bytes: b"a/b/\0c".to_vec(), + null_byte_index: 4 + }), HgPath::new(b"a/b/\0c").check_state() ); // TODO test HgPathError::DecodeError for the Windows implementation. diff --git a/rust/hg-core/src/utils/path_auditor.rs b/rust/hg-core/src/utils/path_auditor.rs new file mode 100644 --- /dev/null +++ b/rust/hg-core/src/utils/path_auditor.rs @@ -0,0 +1,230 @@ +// path_auditor.rs +// +// Copyright 2020 +// Raphaël Gomès , +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +use crate::utils::{ + files::lower_clean, + find_slice_in_slice, + hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError}, +}; +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +/// Ensures that a path is valid for use in the repository i.e. does not use +/// any banned components, does not traverse a symlink, etc. +#[derive(Debug, Default)] +pub struct PathAuditor { + audited: HashSet, + audited_dirs: HashSet, + root: PathBuf, +} + +impl PathAuditor { + pub fn new(root: impl AsRef) -> Self { + Self { + root: root.as_ref().to_owned(), + ..Default::default() + } + } + pub fn audit_path( + &mut self, + path: impl AsRef, + ) -> Result<(), HgPathError> { + // TODO windows "localpath" normalization + let path = path.as_ref(); + if path.is_empty() { + return Ok(()); + } + // TODO case normalization + if self.audited.contains(path) { + return Ok(()); + } + // AIX ignores "/" at end of path, others raise EISDIR. + let last_byte = path.as_bytes()[path.len() - 1]; + if last_byte == b'/' || last_byte == b'\\' { + return Err(HgPathError::EndsWithSlash(path.to_owned())); + } + let parts: Vec<_> = path + .as_bytes() + .split(|b| std::path::is_separator(*b as char)) + .collect(); + + let first_component = lower_clean(parts[0]); + let first_component = first_component.as_slice(); + if !path.split_drive().0.is_empty() + || (first_component == b".hg" + || first_component == b".hg." + || first_component == b"") + || parts.iter().any(|c| c == b"..") + { + return Err(HgPathError::InsideDotHg(path.to_owned())); + } + + // Windows shortname aliases + for part in parts.iter() { + if part.contains(&b'~') { + let mut split = part.splitn(1, |b| *b == b'~'); + let first = + split.next().unwrap().to_owned().to_ascii_uppercase(); + let last = split.next().unwrap(); + if last.iter().all(u8::is_ascii_digit) + && (first == b"HG" || first == b"HG8B6C") + { + return Err(HgPathError::ContainsIllegalComponent( + path.to_owned(), + )); + } + } + } + let lower_path = lower_clean(path.as_bytes()); + if find_slice_in_slice(&lower_path, b".hg").is_some() { + let lower_parts: Vec<_> = path + .as_bytes() + .split(|b| std::path::is_separator(*b as char)) + .collect(); + for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() { + if let Some(pos) = lower_parts[1..] + .iter() + .position(|part| part == &pattern.as_slice()) + { + let base = lower_parts[..=pos] + .iter() + .fold(HgPathBuf::new(), |acc, p| { + acc.join(HgPath::new(p)) + }); + return Err(HgPathError::IsInsideNestedRepo { + path: path.to_owned(), + nested_repo: base, + }); + } + } + } + + let parts = &parts[..parts.len().saturating_sub(1)]; + + // We don't want to add "foo/bar/baz" to `audited_dirs` before checking + // if there's a "foo/.hg" directory. This also means we won't + // accidentally traverse a symlink into some other filesystem (which + // is potentially expensive to access). + for index in 0..parts.len() { + let prefix = &parts[..index + 1].join(&b'/'); + let prefix = HgPath::new(prefix); + if self.audited_dirs.contains(prefix) { + continue; + } + self.check_filesystem(&prefix, &path)?; + } + + self.audited.insert(path.to_owned()); + + Ok(()) + } + + pub fn check_filesystem( + &self, + prefix: impl AsRef, + path: impl AsRef, + ) -> Result<(), HgPathError> { + let prefix = prefix.as_ref(); + let path = path.as_ref(); + let current_path = self.root.join( + hg_path_to_path_buf(prefix) + .map_err(|_| HgPathError::NotFsCompliant(path.to_owned()))?, + ); + match std::fs::symlink_metadata(¤t_path) { + Err(e) => { + // EINVAL can be raised as invalid path syntax under win32. + if e.kind() != std::io::ErrorKind::NotFound + && e.kind() != std::io::ErrorKind::InvalidInput + && e.raw_os_error() != Some(20) + { + // Rust does not yet have an `ErrorKind` for + // `NotADirectory` (errno 20) + // It happens if the dirstate contains `foo/bar` and + // foo is not a directory + return Err(HgPathError::NotFsCompliant(path.to_owned())); + } + } + Ok(meta) => { + if meta.file_type().is_symlink() { + return Err(HgPathError::TraversesSymbolicLink { + path: path.to_owned(), + symlink: prefix.to_owned(), + }); + } + if meta.file_type().is_dir() + && current_path.join(".hg").is_dir() + { + return Err(HgPathError::IsInsideNestedRepo { + path: path.to_owned(), + nested_repo: prefix.to_owned(), + }); + } + } + }; + + Ok(()) + } + + pub fn check(&mut self, path: impl AsRef) -> bool { + self.audit_path(path).is_ok() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::files::get_path_from_bytes; + use crate::utils::hg_path::path_to_hg_path_buf; + + #[test] + fn test_path_auditor() { + let mut auditor = PathAuditor::new(get_path_from_bytes(b"/tmp")); + + let path = HgPath::new(b".hg/00changelog.i"); + assert_eq!( + auditor.audit_path(path), + Err(HgPathError::InsideDotHg(path.to_owned())) + ); + let path = HgPath::new(b"this/is/nested/.hg/thing.txt"); + assert_eq!( + auditor.audit_path(path), + Err(HgPathError::IsInsideNestedRepo { + path: path.to_owned(), + nested_repo: HgPathBuf::from_bytes(b"this/is/nested") + }) + ); + + use std::fs::{create_dir, File}; + use tempfile::tempdir; + + let base_dir = tempdir().unwrap(); + let base_dir_path = base_dir.path(); + let a = base_dir_path.join("a"); + let b = base_dir_path.join("b"); + create_dir(&a).unwrap(); + let in_a_path = a.join("in_a"); + File::create(in_a_path).unwrap(); + + // TODO make portable + std::os::unix::fs::symlink(&a, &b).unwrap(); + + let buf = b.join("in_a").components().skip(2).collect::(); + eprintln!("buf: {}", buf.display()); + let path = path_to_hg_path_buf(buf).unwrap(); + assert_eq!( + auditor.audit_path(&path), + Err(HgPathError::TraversesSymbolicLink { + path: path, + symlink: path_to_hg_path_buf( + b.components().skip(2).collect::() + ) + .unwrap() + }) + ); + } +}