##// END OF EJS Templates
rust-regex: fix issues with regex anchoring and performance...
Raphaël Gomès -
r45347:ad1ec409 default
parent child Browse files
Show More
@@ -1,660 +1,657 b''
1 1 // filepatterns.rs
2 2 //
3 3 // Copyright 2019 Raphaël Gomès <rgomes@octobus.net>
4 4 //
5 5 // This software may be used and distributed according to the terms of the
6 6 // GNU General Public License version 2 or any later version.
7 7
8 8 //! Handling of Mercurial-specific patterns.
9 9
10 10 use crate::{
11 11 utils::{
12 12 files::{canonical_path, get_bytes_from_path, get_path_from_bytes},
13 13 hg_path::{path_to_hg_path_buf, HgPathBuf, HgPathError},
14 14 SliceExt,
15 15 },
16 16 FastHashMap, PatternError,
17 17 };
18 18 use lazy_static::lazy_static;
19 19 use regex::bytes::{NoExpand, Regex};
20 20 use std::fs::File;
21 21 use std::io::Read;
22 22 use std::ops::Deref;
23 23 use std::path::{Path, PathBuf};
24 24 use std::vec::Vec;
25 25
26 26 lazy_static! {
27 27 static ref RE_ESCAPE: Vec<Vec<u8>> = {
28 28 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
29 29 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
30 30 for byte in to_escape {
31 31 v[*byte as usize].insert(0, b'\\');
32 32 }
33 33 v
34 34 };
35 35 }
36 36
/// These are matched in order
///
/// Each entry is `(source, replacement)`: after a `*` has been read from a
/// glob, the first entry whose `source` prefixes the remaining input is
/// consumed and its `replacement` is emitted. The empty source always
/// matches, so a lone `*` becomes `[^/]*`.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
    &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];

/// Appended to the regexp of globs
const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
43 43
/// The kind of a pattern, i.e. how its bytes are to be interpreted.
/// `parse_pattern_syntax` maps the textual prefixes (`re:`, `glob:`, ...) to
/// these variants.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum PatternSyntax {
    /// A regular expression
    Regexp,
    /// Glob that matches at the front of the path
    RootGlob,
    /// Glob that matches at any suffix of the path (still anchored at
    /// slashes)
    Glob,
    /// A path relative to repository root, which is matched recursively
    Path,
    /// A path relative to cwd
    RelPath,
    /// An unrooted glob (*.rs matches Rust files in all dirs)
    RelGlob,
    /// A regexp that needn't match the start of a name
    RelRegexp,
    /// A path relative to repository root, which is matched non-recursively
    /// (will not match subdirectories)
    RootFiles,
    /// A file of patterns to read and include
    Include,
    /// A file of patterns to match against files under the same directory
    SubInclude,
}
69 69
/// Transforms a glob pattern into a regex
///
/// The input is consumed byte by byte:
/// - `*`, `**` and `**/` are rewritten according to `GLOB_REPLACEMENTS`,
/// - `?` becomes `.`,
/// - `[...]` becomes a regex character class (a leading `!` negates it),
/// - `{a,b}` becomes the non-capturing alternation `(?:a|b)`,
/// - `\x` emits `x` as a literal,
/// - every other byte is emitted through the `RE_ESCAPE` table.
///
/// The returned bytes carry no anchors and no suffix; callers add those.
fn glob_to_re(pat: &[u8]) -> Vec<u8> {
    let mut input = pat;
    let mut res: Vec<u8> = vec![];
    // Number of `{` groups currently open: `}` and `,` are only special
    // while this is non-zero.
    let mut group_depth = 0;

    while let Some((c, rest)) = input.split_first() {
        input = rest;

        match c {
            b'*' => {
                // The first replacement whose source prefixes the rest of
                // the input wins; the last source is empty, so one always
                // applies.
                for (source, repl) in GLOB_REPLACEMENTS {
                    if let Some(rest) = input.drop_prefix(source) {
                        input = rest;
                        res.extend(*repl);
                        break;
                    }
                }
            }
            b'?' => res.extend(b"."),
            b'[' => {
                // Look for the closing bracket, ignoring the byte right
                // after `[` so that it is always part of the class.
                match input.iter().skip(1).position(|b| *b == b']') {
                    // No closing bracket: the `[` is a literal.
                    None => res.extend(b"\\["),
                    Some(end) => {
                        // Account for the one we skipped
                        let end = end + 1;

                        res.extend(b"[");

                        for (i, b) in input[..end].iter().enumerate() {
                            if *b == b'!' && i == 0 {
                                // Glob negation becomes regex negation.
                                res.extend(b"^")
                            } else if *b == b'^' && i == 0 {
                                // A leading `^` is a literal in globs.
                                res.extend(b"\\^")
                            } else if *b == b'\\' {
                                res.extend(b"\\\\")
                            } else {
                                res.push(*b)
                            }
                        }
                        res.extend(b"]");
                        // Resume after the closing bracket.
                        input = &input[end + 1..];
                    }
                }
            }
            b'{' => {
                group_depth += 1;
                res.extend(b"(?:")
            }
            b'}' if group_depth > 0 => {
                group_depth -= 1;
                res.extend(b")");
            }
            b',' if group_depth > 0 => res.extend(b"|"),
            b'\\' => {
                // Escape the byte that follows; a trailing backslash has
                // nothing to escape and is emitted as a literal backslash.
                let c = {
                    if let Some((c, rest)) = input.split_first() {
                        input = rest;
                        c
                    } else {
                        c
                    }
                };
                res.extend(&RE_ESCAPE[*c as usize])
            }
            _ => res.extend(&RE_ESCAPE[*c as usize]),
        }
    }
    res
}
140 140
141 141 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
142 142 pattern
143 143 .iter()
144 144 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
145 145 .collect()
146 146 }
147 147
148 148 pub fn parse_pattern_syntax(
149 149 kind: &[u8],
150 150 ) -> Result<PatternSyntax, PatternError> {
151 151 match kind {
152 152 b"re:" => Ok(PatternSyntax::Regexp),
153 153 b"path:" => Ok(PatternSyntax::Path),
154 154 b"relpath:" => Ok(PatternSyntax::RelPath),
155 155 b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
156 156 b"relglob:" => Ok(PatternSyntax::RelGlob),
157 157 b"relre:" => Ok(PatternSyntax::RelRegexp),
158 158 b"glob:" => Ok(PatternSyntax::Glob),
159 159 b"rootglob:" => Ok(PatternSyntax::RootGlob),
160 160 b"include:" => Ok(PatternSyntax::Include),
161 161 b"subinclude:" => Ok(PatternSyntax::SubInclude),
162 162 _ => Err(PatternError::UnsupportedSyntax(
163 163 String::from_utf8_lossy(kind).to_string(),
164 164 )),
165 165 }
166 166 }
167 167
168 168 /// Builds the regex that corresponds to the given pattern.
169 169 /// If within a `syntax: regexp` context, returns the pattern,
170 170 /// otherwise, returns the corresponding regex.
171 171 fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
172 172 let IgnorePattern {
173 173 syntax, pattern, ..
174 174 } = entry;
175 175 if pattern.is_empty() {
176 176 return vec![];
177 177 }
178 178 match syntax {
179 // The `regex` crate adds `.*` to the start and end of expressions
180 // if there are no anchors, so add them.
181 PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
179 PatternSyntax::Regexp => pattern.to_owned(),
182 180 PatternSyntax::RelRegexp => {
183 181 // The `regex` crate accepts `**` while `re2` and Python's `re`
184 182 // do not. Checking for `*` correctly triggers the same error all
185 183 // engines.
186 184 if pattern[0] == b'^' || pattern[0] == b'*' {
187 185 return pattern.to_owned();
188 186 }
189 187 [&b".*"[..], pattern].concat()
190 188 }
191 189 PatternSyntax::Path | PatternSyntax::RelPath => {
192 190 if pattern == b"." {
193 191 return vec![];
194 192 }
195 193 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat()
196 194 }
197 195 PatternSyntax::RootFiles => {
198 196 let mut res = if pattern == b"." {
199 vec![b'^']
197 vec![]
200 198 } else {
201 199 // Pattern is a directory name.
202 [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
200 [escape_pattern(pattern).as_slice(), b"/"].concat()
203 201 };
204 202
205 203 // Anything after the pattern must be a non-directory.
206 204 res.extend(b"[^/]+$");
207 res.push(b'$');
208 205 res
209 206 }
210 207 PatternSyntax::RelGlob => {
211 208 let glob_re = glob_to_re(pattern);
212 209 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
213 210 [b".*", rest, GLOB_SUFFIX].concat()
214 211 } else {
215 212 [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
216 213 }
217 214 }
218 215 PatternSyntax::Glob | PatternSyntax::RootGlob => {
219 [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
216 [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
220 217 }
221 218 PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
222 219 }
223 220 }
224 221
/// Bytes that have a special meaning in a glob. `build_single_regex` treats
/// a root glob containing none of them as an exact path that needs no regex.
const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
    [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
227 224
/// Normalizes a `/`-separated path given as bytes: empty and `.` components
/// are dropped, `..` cancels the component before it where possible, and an
/// empty result is returned as `.`. One or two leading slashes are kept;
/// three or more collapse to one.
///
/// TODO support other platforms
#[cfg(unix)]
pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
    const SEP: u8 = b'/';
    const CUR_DIR: &[u8] = b".";
    const PARENT_DIR: &[u8] = b"..";

    if bytes.is_empty() {
        return CUR_DIR.to_vec();
    }

    // POSIX allows one or two initial slashes, but treats three or more
    // as single slash.
    let leading_slashes =
        match bytes.iter().take_while(|b| **b == SEP).count() {
            n if n > 2 => 1,
            n => n,
        };

    let mut kept: Vec<&[u8]> = Vec::new();
    for component in bytes.split(|b| *b == SEP) {
        if component.is_empty() || component == CUR_DIR {
            continue;
        }
        // A `..` is kept only when there is nothing it can cancel in a
        // relative path: either nothing precedes it or another `..` does.
        let keep = component != PARENT_DIR
            || (leading_slashes == 0 && kept.is_empty())
            || kept.last() == Some(&PARENT_DIR);
        if keep {
            kept.push(component);
        } else {
            // Cancels the previous component; a no-op at the root.
            kept.pop();
        }
    }

    let mut normalized = vec![SEP; leading_slashes];
    normalized.extend(kept.join(&SEP));

    if normalized.is_empty() {
        CUR_DIR.to_vec()
    } else {
        normalized
    }
}
269 266
270 267 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
271 268 /// that don't need to be transformed into a regex.
272 269 pub fn build_single_regex(
273 270 entry: &IgnorePattern,
274 271 ) -> Result<Option<Vec<u8>>, PatternError> {
275 272 let IgnorePattern {
276 273 pattern, syntax, ..
277 274 } = entry;
278 275 let pattern = match syntax {
279 276 PatternSyntax::RootGlob
280 277 | PatternSyntax::Path
281 278 | PatternSyntax::RelGlob
282 279 | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
283 280 PatternSyntax::Include | PatternSyntax::SubInclude => {
284 281 return Err(PatternError::NonRegexPattern(entry.clone()))
285 282 }
286 283 _ => pattern.to_owned(),
287 284 };
288 285 if *syntax == PatternSyntax::RootGlob
289 286 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
290 287 {
291 288 Ok(None)
292 289 } else {
293 290 let mut entry = entry.clone();
294 291 entry.pattern = pattern;
295 292 Ok(Some(_build_single_regex(&entry)))
296 293 }
297 294 }
298 295
299 296 lazy_static! {
300 297 static ref SYNTAXES: FastHashMap<&'static [u8], &'static [u8]> = {
301 298 let mut m = FastHashMap::default();
302 299
303 300 m.insert(b"re".as_ref(), b"relre:".as_ref());
304 301 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
305 302 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
306 303 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
307 304 m.insert(b"include".as_ref(), b"include:".as_ref());
308 305 m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
309 306 m
310 307 };
311 308 }
312 309
/// Non-fatal problems found while reading a pattern file.
#[derive(Debug)]
pub enum PatternFileWarning {
    /// (file path, syntax bytes)
    ///
    /// A `syntax:` line named a syntax that is not known.
    InvalidSyntax(PathBuf, Vec<u8>),
    /// File path
    ///
    /// The pattern file does not exist.
    NoSuchFile(PathBuf),
}
320 317
/// Parses the contents of a pattern file into a list of patterns.
///
/// `lines` is the raw file content and `file_path` is recorded as the source
/// of every pattern. Comments introduced by an unescaped `#` are removed and
/// `\#` is turned into `#`. A `syntax: NAME` line changes the syntax of the
/// lines that follow it; when `NAME` is unknown and `warn` is set, an
/// `InvalidSyntax` warning is recorded. A line may also carry its own syntax
/// prefix, which applies to that line only. The initial syntax is `relre:`.
///
/// # Errors
///
/// An `UnsupportedSyntax` error from `parse_pattern_syntax` is returned as
/// `UnsupportedSyntaxInFile`, with the file path and the 1-based line number
/// added.
pub fn parse_pattern_file_contents<P: AsRef<Path>>(
    lines: &[u8],
    file_path: P,
    warn: bool,
) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
    // Group 1 is what precedes a `#` that is not escaped, i.e. that follows
    // an even number of backslashes.
    // NOTE(review): both regexes are compiled on every call; they could be
    // built once.
    let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
    let comment_escape_regex = Regex::new(r"\\#").unwrap();
    let mut inputs: Vec<IgnorePattern> = vec![];
    let mut warnings: Vec<PatternFileWarning> = vec![];

    let mut current_syntax = b"relre:".as_ref();

    for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
        // Line numbers reported in errors are 1-based.
        let line_number = line_number + 1;

        // Owns the unescaped line, when there is one, so that `line` can
        // borrow from it for the rest of the iteration.
        let line_buf;
        if line.contains(&b'#') {
            if let Some(cap) = comment_regex.captures(line) {
                // Cut the comment off, keeping what group 1 matched.
                line = &line[..cap.get(1).unwrap().end()]
            }
            line_buf = comment_escape_regex.replace_all(line, NoExpand(b"#"));
            line = &line_buf;
        }

        let mut line = line.trim_end();

        if line.is_empty() {
            continue;
        }

        if let Some(syntax) = line.drop_prefix(b"syntax:") {
            let syntax = syntax.trim();

            if let Some(rel_syntax) = SYNTAXES.get(syntax) {
                current_syntax = rel_syntax;
            } else if warn {
                warnings.push(PatternFileWarning::InvalidSyntax(
                    file_path.as_ref().to_owned(),
                    syntax.to_owned(),
                ));
            }
            // A `syntax:` line never produces a pattern, valid or not.
            continue;
        }

        let mut line_syntax: &[u8] = &current_syntax;

        // A line may start with either form of a known syntax (`relre:` or
        // `re:`); both select the value stored in `SYNTAXES` for this line.
        for (s, rels) in SYNTAXES.iter() {
            if let Some(rest) = line.drop_prefix(rels) {
                line_syntax = rels;
                line = rest;
                break;
            }
            if let Some(rest) = line.drop_prefix(&[s, &b":"[..]].concat()) {
                line_syntax = rels;
                line = rest;
                break;
            }
        }

        inputs.push(IgnorePattern::new(
            parse_pattern_syntax(&line_syntax).map_err(|e| match e {
                PatternError::UnsupportedSyntax(syntax) => {
                    PatternError::UnsupportedSyntaxInFile(
                        syntax,
                        file_path.as_ref().to_string_lossy().into(),
                        line_number,
                    )
                }
                _ => e,
            })?,
            &line,
            &file_path,
        ));
    }
    Ok((inputs, warnings))
}
397 394
398 395 pub fn read_pattern_file<P: AsRef<Path>>(
399 396 file_path: P,
400 397 warn: bool,
401 398 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
402 399 let mut f = match File::open(file_path.as_ref()) {
403 400 Ok(f) => Ok(f),
404 401 Err(e) => match e.kind() {
405 402 std::io::ErrorKind::NotFound => {
406 403 return Ok((
407 404 vec![],
408 405 vec![PatternFileWarning::NoSuchFile(
409 406 file_path.as_ref().to_owned(),
410 407 )],
411 408 ))
412 409 }
413 410 _ => Err(e),
414 411 },
415 412 }?;
416 413 let mut contents = Vec::new();
417 414
418 415 f.read_to_end(&mut contents)?;
419 416
420 417 Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
421 418 }
422 419
/// Represents an entry in an "ignore" file.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct IgnorePattern {
    /// How `pattern` is to be interpreted
    pub syntax: PatternSyntax,
    /// The pattern itself, without its syntax prefix
    pub pattern: Vec<u8>,
    /// Path of the file this entry comes from
    pub source: PathBuf,
}
430 427
431 428 impl IgnorePattern {
432 429 pub fn new(
433 430 syntax: PatternSyntax,
434 431 pattern: &[u8],
435 432 source: impl AsRef<Path>,
436 433 ) -> Self {
437 434 Self {
438 435 syntax,
439 436 pattern: pattern.to_owned(),
440 437 source: source.as_ref().to_owned(),
441 438 }
442 439 }
443 440 }
444 441
/// Shorthand for a `Result` whose error type is `PatternError`.
pub type PatternResult<T> = Result<T, PatternError>;
446 443
447 444 /// Wrapper for `read_pattern_file` that also recursively expands `include:`
448 445 /// patterns.
449 446 ///
450 447 /// `subinclude:` is not treated as a special pattern here: unraveling them
451 448 /// needs to occur in the "ignore" phase.
452 449 pub fn get_patterns_from_file(
453 450 pattern_file: impl AsRef<Path>,
454 451 root_dir: impl AsRef<Path>,
455 452 ) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> {
456 453 let (patterns, mut warnings) = read_pattern_file(&pattern_file, true)?;
457 454 let patterns = patterns
458 455 .into_iter()
459 456 .flat_map(|entry| -> PatternResult<_> {
460 457 let IgnorePattern {
461 458 syntax,
462 459 pattern,
463 460 source: _,
464 461 } = &entry;
465 462 Ok(match syntax {
466 463 PatternSyntax::Include => {
467 464 let inner_include =
468 465 root_dir.as_ref().join(get_path_from_bytes(&pattern));
469 466 let (inner_pats, inner_warnings) = get_patterns_from_file(
470 467 &inner_include,
471 468 root_dir.as_ref(),
472 469 )?;
473 470 warnings.extend(inner_warnings);
474 471 inner_pats
475 472 }
476 473 _ => vec![entry],
477 474 })
478 475 })
479 476 .flatten()
480 477 .collect();
481 478
482 479 Ok((patterns, warnings))
483 480 }
484 481
/// Holds all the information needed to handle a `subinclude:` pattern.
pub struct SubInclude {
    /// Will be used for repository (hg) paths that start with this prefix.
    /// It is relative to the current working directory, so comparing against
    /// repository paths is painless.
    pub prefix: HgPathBuf,
    /// The file itself, containing the patterns
    pub path: PathBuf,
    /// Folder in the filesystem where this file applies
    pub root: PathBuf,
}
496 493
497 494 impl SubInclude {
498 495 pub fn new(
499 496 root_dir: impl AsRef<Path>,
500 497 pattern: &[u8],
501 498 source: impl AsRef<Path>,
502 499 ) -> Result<SubInclude, HgPathError> {
503 500 let normalized_source =
504 501 normalize_path_bytes(&get_bytes_from_path(source));
505 502
506 503 let source_root = get_path_from_bytes(&normalized_source);
507 504 let source_root = source_root.parent().unwrap_or(source_root.deref());
508 505
509 506 let path = source_root.join(get_path_from_bytes(pattern));
510 507 let new_root = path.parent().unwrap_or(path.deref());
511 508
512 509 let prefix = canonical_path(&root_dir, &root_dir, new_root)?;
513 510
514 511 Ok(Self {
515 512 prefix: path_to_hg_path_buf(prefix).and_then(|mut p| {
516 513 if !p.is_empty() {
517 514 p.push(b'/');
518 515 }
519 516 Ok(p)
520 517 })?,
521 518 path: path.to_owned(),
522 519 root: new_root.to_owned(),
523 520 })
524 521 }
525 522 }
526 523
527 524 /// Separate and pre-process subincludes from other patterns for the "ignore"
528 525 /// phase.
529 526 pub fn filter_subincludes(
530 527 ignore_patterns: &[IgnorePattern],
531 528 root_dir: impl AsRef<Path>,
532 529 ) -> Result<(Vec<SubInclude>, Vec<&IgnorePattern>), HgPathError> {
533 530 let mut subincludes = vec![];
534 531 let mut others = vec![];
535 532
536 533 for ignore_pattern in ignore_patterns.iter() {
537 534 let IgnorePattern {
538 535 syntax,
539 536 pattern,
540 537 source,
541 538 } = ignore_pattern;
542 539 if *syntax == PatternSyntax::SubInclude {
543 540 subincludes.push(SubInclude::new(&root_dir, pattern, &source)?);
544 541 } else {
545 542 others.push(ignore_pattern)
546 543 }
547 544 }
548 545 Ok((subincludes, others))
549 546 }
550 547
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Bytes outside the escape list are left alone, the others get a
    /// backslash.
    #[test]
    fn escape_pattern_test() {
        let untouched =
            br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());
        // All escape codes
        assert_eq!(
            escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
                .to_vec()
        );
    }

    /// Each glob construct is translated to the expected regex fragment.
    #[test]
    fn glob_test() {
        assert_eq!(glob_to_re(br#"?"#), br#"."#);
        assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
        assert_eq!(glob_to_re(br#"**"#), br#".*"#);
        assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
        assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
        assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
        assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
        assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
    }

    #[test]
    fn test_parse_pattern_file_contents() {
        // A `syntax:` line applies to the lines that follow it.
        let lines = b"syntax: glob\n*.elc";

        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"*.elc",
                Path::new("file_path")
            )],
        );

        // `syntax:` lines alone produce no pattern.
        let lines = b"syntax: include\nsyntax: glob";

        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![]
        );
        // A per-line `glob:` prefix yields a relative glob.
        let lines = b"glob:**.o";
        assert_eq!(
            parse_pattern_file_contents(lines, Path::new("file_path"), false)
                .unwrap()
                .0,
            vec![IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"**.o",
                Path::new("file_path")
            )]
        );
    }

    #[test]
    fn test_build_single_regex() {
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RelGlob,
                b"rust/target/",
                Path::new("")
            ))
            .unwrap(),
            Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
        );
    }

    #[test]
    fn test_build_single_regex_shortcut() {
        // Root globs without any special character need no regex.
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"",
                Path::new("")
            ))
            .unwrap(),
            None,
        );
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"whatever",
                Path::new("")
            ))
            .unwrap(),
            None,
        );
        // The regex built for a root glob carries no `^` anchor.
        assert_eq!(
            build_single_regex(&IgnorePattern::new(
                PatternSyntax::RootGlob,
                b"*.o",
                Path::new("")
            ))
            .unwrap(),
            Some(br"[^/]*\.o(?:/|$)".to_vec()),
        );
    }
}
@@ -1,954 +1,957 b''
1 1 // matchers.rs
2 2 //
3 3 // Copyright 2019 Raphaël Gomès <rgomes@octobus.net>
4 4 //
5 5 // This software may be used and distributed according to the terms of the
6 6 // GNU General Public License version 2 or any later version.
7 7
8 8 //! Structs and types for matching files and directories.
9 9
10 10 #[cfg(feature = "with-re2")]
11 11 use crate::re2::Re2;
12 12 use crate::{
13 13 dirstate::dirs_multiset::DirsChildrenMultiset,
14 14 filepatterns::{
15 15 build_single_regex, filter_subincludes, get_patterns_from_file,
16 16 PatternFileWarning, PatternResult, SubInclude,
17 17 },
18 18 utils::{
19 19 files::find_dirs,
20 20 hg_path::{HgPath, HgPathBuf},
21 21 Escaped,
22 22 },
23 23 DirsMultiset, DirstateMapError, FastHashMap, IgnorePattern, PatternError,
24 24 PatternSyntax,
25 25 };
26 26
27 27 use crate::filepatterns::normalize_path_bytes;
28 28 use std::borrow::ToOwned;
29 29 use std::collections::HashSet;
30 30 use std::fmt::{Display, Error, Formatter};
31 31 use std::iter::FromIterator;
32 32 use std::ops::Deref;
33 33 use std::path::{Path, PathBuf};
34 34
35 35 use micro_timer::timed;
36 36
/// What `Matcher::visit_children_set` says about visiting a directory.
#[derive(Debug, PartialEq)]
pub enum VisitChildrenSet<'a> {
    /// Don't visit anything
    Empty,
    /// Only visit this directory
    This,
    /// Visit this directory and these subdirectories
    /// TODO Should we implement a `NonEmptyHashSet`?
    Set(HashSet<&'a HgPath>),
    /// Visit this directory and all subdirectories
    Recursive,
}
49 49
/// Common interface of all the matchers: objects that decide which files
/// are matched and which directories are worth visiting.
pub trait Matcher {
    /// Explicitly listed files
    fn file_set(&self) -> Option<&HashSet<&HgPath>>;
    /// Returns whether `filename` is in `file_set`
    fn exact_match(&self, filename: impl AsRef<HgPath>) -> bool;
    /// Returns whether `filename` is matched by this matcher
    fn matches(&self, filename: impl AsRef<HgPath>) -> bool;
    /// Decides whether a directory should be visited based on whether it
    /// has potential matches in it or one of its subdirectories, and
    /// potentially lists which subdirectories of that directory should be
    /// visited. This is based on the match's primary, included, and excluded
    /// patterns.
    ///
    /// # Example
    ///
    /// Assume matchers `['path:foo/bar', 'rootfilesin:qux']`, we would
    /// return the following values (assuming the implementation of
    /// visit_children_set is capable of recognizing this; some implementations
    /// are not).
    ///
    /// ```text
    /// '' -> {'foo', 'qux'}
    /// 'baz' -> set()
    /// 'foo' -> {'bar'}
    /// // Ideally this would be `Recursive`, but since the prefix nature of
    /// // matchers is applied to the entire matcher, we have to downgrade this
    /// // to `This` due to the (yet to be implemented in Rust) non-prefix
    /// // `RootFilesIn`-kind matcher being mixed in.
    /// 'foo/bar' -> 'this'
    /// 'qux' -> 'this'
    /// ```
    /// # Important
    ///
    /// Most matchers do not know if they're representing files or
    /// directories. They see `['path:dir/f']` and don't know whether `f` is a
    /// file or a directory, so `visit_children_set('dir')` for most matchers
    /// will return `HashSet{ HgPath { "f" } }`, but if the matcher knows it's
    /// a file (like the yet to be implemented in Rust `ExactMatcher` does),
    /// it may return `VisitChildrenSet::This`.
    /// Do not rely on the return being a `HashSet` indicating that there are
    /// no files in this dir to investigate (or equivalently that if there are
    /// files to investigate in 'dir' that it will always return
    /// `VisitChildrenSet::This`).
    fn visit_children_set(
        &self,
        directory: impl AsRef<HgPath>,
    ) -> VisitChildrenSet;
    /// Matcher will match everything and `files_set()` will be empty:
    /// optimization might be possible.
    fn matches_everything(&self) -> bool;
    /// Matcher will match exactly the files in `files_set()`: optimization
    /// might be possible.
    fn is_exact(&self) -> bool;
}
105 105
/// Matches everything.
/// ```
/// use hg::{ matchers::{Matcher, AlwaysMatcher}, utils::hg_path::HgPath };
///
/// let matcher = AlwaysMatcher;
///
/// assert_eq!(matcher.matches(HgPath::new(b"whatever")), true);
/// assert_eq!(matcher.matches(HgPath::new(b"b.txt")), true);
/// assert_eq!(matcher.matches(HgPath::new(b"main.c")), true);
/// assert_eq!(matcher.matches(HgPath::new(br"re:.*\.c$")), true);
/// ```
#[derive(Debug)]
pub struct AlwaysMatcher;

impl Matcher for AlwaysMatcher {
    /// There is no explicit list of files.
    fn file_set(&self) -> Option<&HashSet<&HgPath>> {
        None
    }
    /// Nothing is listed explicitly, so nothing is an exact match.
    fn exact_match(&self, _filename: impl AsRef<HgPath>) -> bool {
        false
    }
    /// Every file matches.
    fn matches(&self, _filename: impl AsRef<HgPath>) -> bool {
        true
    }
    /// Every directory is to be visited recursively.
    fn visit_children_set(
        &self,
        _directory: impl AsRef<HgPath>,
    ) -> VisitChildrenSet {
        VisitChildrenSet::Recursive
    }
    fn matches_everything(&self) -> bool {
        true
    }
    fn is_exact(&self) -> bool {
        false
    }
}
143 143
/// Matches the input files exactly. They are interpreted as paths, not
/// patterns.
///
/// ```
/// use hg::{ matchers::{Matcher, FileMatcher}, utils::hg_path::HgPath };
///
/// let files = [HgPath::new(b"a.txt"), HgPath::new(br"re:.*\.c$")];
/// let matcher = FileMatcher::new(&files).unwrap();
///
/// assert_eq!(matcher.matches(HgPath::new(b"a.txt")), true);
/// assert_eq!(matcher.matches(HgPath::new(b"b.txt")), false);
/// assert_eq!(matcher.matches(HgPath::new(b"main.c")), false);
/// assert_eq!(matcher.matches(HgPath::new(br"re:.*\.c$")), true);
/// ```
#[derive(Debug)]
pub struct FileMatcher<'a> {
    /// The files to match, borrowed from the caller
    files: HashSet<&'a HgPath>,
    /// Built from the same files with `DirsMultiset::from_manifest`
    dirs: DirsMultiset,
}

impl<'a> FileMatcher<'a> {
    /// Creates a matcher for exactly the given `files`.
    ///
    /// # Errors
    ///
    /// Returns the error of `DirsMultiset::from_manifest`, if any.
    pub fn new(
        files: &'a [impl AsRef<HgPath>],
    ) -> Result<Self, DirstateMapError> {
        Ok(Self {
            files: HashSet::from_iter(files.iter().map(|f| f.as_ref())),
            dirs: DirsMultiset::from_manifest(files)?,
        })
    }
    /// Returns whether `filename` is one of the files of this matcher.
    fn inner_matches(&self, filename: impl AsRef<HgPath>) -> bool {
        self.files.contains(filename.as_ref())
    }
}
177 177
178 178 impl<'a> Matcher for FileMatcher<'a> {
179 179 fn file_set(&self) -> Option<&HashSet<&HgPath>> {
180 180 Some(&self.files)
181 181 }
182 182 fn exact_match(&self, filename: impl AsRef<HgPath>) -> bool {
183 183 self.inner_matches(filename)
184 184 }
185 185 fn matches(&self, filename: impl AsRef<HgPath>) -> bool {
186 186 self.inner_matches(filename)
187 187 }
188 188 fn visit_children_set(
189 189 &self,
190 190 directory: impl AsRef<HgPath>,
191 191 ) -> VisitChildrenSet {
192 192 if self.files.is_empty() || !self.dirs.contains(&directory) {
193 193 return VisitChildrenSet::Empty;
194 194 }
195 195 let dirs_as_set = self.dirs.iter().map(|k| k.deref()).collect();
196 196
197 197 let mut candidates: HashSet<&HgPath> =
198 198 self.files.union(&dirs_as_set).map(|k| *k).collect();
199 199 candidates.remove(HgPath::new(b""));
200 200
201 201 if !directory.as_ref().is_empty() {
202 202 let directory = [directory.as_ref().as_bytes(), b"/"].concat();
203 203 candidates = candidates
204 204 .iter()
205 205 .filter_map(|c| {
206 206 if c.as_bytes().starts_with(&directory) {
207 207 Some(HgPath::new(&c.as_bytes()[directory.len()..]))
208 208 } else {
209 209 None
210 210 }
211 211 })
212 212 .collect();
213 213 }
214 214
215 215 // `self.dirs` includes all of the directories, recursively, so if
216 216 // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo',
217 217 // 'foo/bar' in it. Thus we can safely ignore a candidate that has a
218 218 // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate
219 219 // subdir will be in there without a slash.
220 220 VisitChildrenSet::Set(
221 221 candidates
222 222 .iter()
223 223 .filter_map(|c| {
224 224 if c.bytes().all(|b| *b != b'/') {
225 225 Some(*c)
226 226 } else {
227 227 None
228 228 }
229 229 })
230 230 .collect(),
231 231 )
232 232 }
233 233 fn matches_everything(&self) -> bool {
234 234 false
235 235 }
236 236 fn is_exact(&self) -> bool {
237 237 true
238 238 }
239 239 }
240 240
/// Matches files that are included in the ignore rules.
#[cfg_attr(
    feature = "with-re2",
    doc = r##"
```
use hg::{
matchers::{IncludeMatcher, Matcher},
IgnorePattern,
PatternSyntax,
utils::hg_path::HgPath
};
use std::path::Path;
///
let ignore_patterns =
vec![IgnorePattern::new(PatternSyntax::RootGlob, b"this*", Path::new(""))];
let (matcher, _) = IncludeMatcher::new(ignore_patterns, "").unwrap();
///
assert_eq!(matcher.matches(HgPath::new(b"testing")), false);
assert_eq!(matcher.matches(HgPath::new(b"this should work")), true);
assert_eq!(matcher.matches(HgPath::new(b"this also")), true);
assert_eq!(matcher.matches(HgPath::new(b"but not this")), false);
```
"##
)]
pub struct IncludeMatcher<'a> {
    // NOTE(review): presumably the combined regex the matcher was built
    // from; not read by the code visible here -- confirm.
    patterns: Vec<u8>,
    /// The function `matches` delegates to
    match_fn: Box<dyn for<'r> Fn(&'r HgPath) -> bool + 'a + Sync>,
    /// Whether all the patterns match a prefix (i.e. recursively)
    prefix: bool,
    /// Directories for which `visit_children_set` answers `Recursive` (when
    /// `prefix` is set) or `This`, the latter for their descendants too
    roots: HashSet<HgPathBuf>,
    /// Directories for which `visit_children_set` answers `This`
    dirs: HashSet<HgPathBuf>,
    /// Directories for which `visit_children_set` may answer with a set of
    /// children
    parents: HashSet<HgPathBuf>,
}
274 274
275 275 impl<'a> Matcher for IncludeMatcher<'a> {
276 276 fn file_set(&self) -> Option<&HashSet<&HgPath>> {
277 277 None
278 278 }
279 279
280 280 fn exact_match(&self, _filename: impl AsRef<HgPath>) -> bool {
281 281 false
282 282 }
283 283
284 284 fn matches(&self, filename: impl AsRef<HgPath>) -> bool {
285 285 (self.match_fn)(filename.as_ref())
286 286 }
287 287
288 288 fn visit_children_set(
289 289 &self,
290 290 directory: impl AsRef<HgPath>,
291 291 ) -> VisitChildrenSet {
292 292 let dir = directory.as_ref();
293 293 if self.prefix && self.roots.contains(dir) {
294 294 return VisitChildrenSet::Recursive;
295 295 }
296 296 if self.roots.contains(HgPath::new(b""))
297 297 || self.roots.contains(dir)
298 298 || self.dirs.contains(dir)
299 299 || find_dirs(dir).any(|parent_dir| self.roots.contains(parent_dir))
300 300 {
301 301 return VisitChildrenSet::This;
302 302 }
303 303
304 304 if self.parents.contains(directory.as_ref()) {
305 305 let multiset = self.get_all_parents_children();
306 306 if let Some(children) = multiset.get(dir) {
307 307 return VisitChildrenSet::Set(children.to_owned());
308 308 }
309 309 }
310 310 VisitChildrenSet::Empty
311 311 }
312 312
313 313 fn matches_everything(&self) -> bool {
314 314 false
315 315 }
316 316
317 317 fn is_exact(&self) -> bool {
318 318 false
319 319 }
320 320 }
321 321
#[cfg(feature = "with-re2")]
/// Returns a function that matches an `HgPath` against the given regex
/// pattern.
///
/// This can fail when the pattern is invalid or not supported by the
/// underlying engine `Re2`, for instance anything with back-references.
/// The engine's error is returned as `PatternError::UnsupportedSyntax`.
#[timed]
fn re_matcher(
    pattern: &[u8],
) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
    let compiled =
        Re2::new(pattern).map_err(PatternError::UnsupportedSyntax)?;
    Ok(move |path: &HgPath| compiled.is_match(path.as_bytes()))
}
336 336
337 337 #[cfg(not(feature = "with-re2"))]
338 338 /// Returns a function that matches an `HgPath` against the given regex
339 339 /// pattern.
340 340 ///
341 341 /// This can fail when the pattern is invalid or not supported by the
342 342 /// underlying engine (the `regex` crate), for instance anything with
343 343 /// back-references.
344 344 #[timed]
345 345 fn re_matcher(
346 346 pattern: &[u8],
347 347 ) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
348 348 use std::io::Write;
349 349
350 let mut escaped_bytes = vec![];
350 // The `regex` crate adds `.*` to the start and end of expressions if there
351 // are no anchors, so add the start anchor.
352 let mut escaped_bytes = vec![b'^', b'(', b'?', b':'];
351 353 for byte in pattern {
352 354 if *byte > 127 {
353 355 write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
354 356 } else {
355 357 escaped_bytes.push(*byte);
356 358 }
357 359 }
360 escaped_bytes.push(b')');
358 361
359 362 // Avoid the cost of UTF8 checking
360 363 //
361 364 // # Safety
362 365 // This is safe because we escaped all non-ASCII bytes.
363 366 let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
364 367 let re = regex::bytes::RegexBuilder::new(&pattern_string)
365 368 .unicode(false)
366 369 .build()
367 370 .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
368 371
369 372 Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
370 373 }
371 374
372 375 /// Returns the regex pattern and a function that matches an `HgPath` against
373 376 /// said regex formed by the given ignore patterns.
374 377 fn build_regex_match<'a>(
375 378 ignore_patterns: &'a [&'a IgnorePattern],
376 379 ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> {
377 380 let mut regexps = vec![];
378 381 let mut exact_set = HashSet::new();
379 382
380 383 for pattern in ignore_patterns {
381 384 if let Some(re) = build_single_regex(pattern)? {
382 385 regexps.push(re);
383 386 } else {
384 387 let exact = normalize_path_bytes(&pattern.pattern);
385 388 exact_set.insert(HgPathBuf::from_bytes(&exact));
386 389 }
387 390 }
388 391
389 392 let full_regex = regexps.join(&b'|');
390 393
391 394 // An empty pattern would cause the regex engine to incorrectly match the
392 395 // (empty) root directory
393 396 let func = if !(regexps.is_empty()) {
394 397 let matcher = re_matcher(&full_regex)?;
395 398 let func = move |filename: &HgPath| {
396 399 exact_set.contains(filename) || matcher(filename)
397 400 };
398 401 Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
399 402 } else {
400 403 let func = move |filename: &HgPath| exact_set.contains(filename);
401 404 Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
402 405 };
403 406
404 407 Ok((full_regex, func))
405 408 }
406 409
407 410 /// Returns roots and directories corresponding to each pattern.
408 411 ///
409 412 /// This calculates the roots and directories exactly matching the patterns and
410 413 /// returns a tuple of (roots, dirs). It does not return other directories
411 414 /// which may also need to be considered, like the parent directories.
412 415 fn roots_and_dirs(
413 416 ignore_patterns: &[IgnorePattern],
414 417 ) -> (Vec<HgPathBuf>, Vec<HgPathBuf>) {
415 418 let mut roots = Vec::new();
416 419 let mut dirs = Vec::new();
417 420
418 421 for ignore_pattern in ignore_patterns {
419 422 let IgnorePattern {
420 423 syntax, pattern, ..
421 424 } = ignore_pattern;
422 425 match syntax {
423 426 PatternSyntax::RootGlob | PatternSyntax::Glob => {
424 427 let mut root = vec![];
425 428
426 429 for p in pattern.split(|c| *c == b'/') {
427 430 if p.iter().any(|c| match *c {
428 431 b'[' | b'{' | b'*' | b'?' => true,
429 432 _ => false,
430 433 }) {
431 434 break;
432 435 }
433 436 root.push(HgPathBuf::from_bytes(p));
434 437 }
435 438 let buf =
436 439 root.iter().fold(HgPathBuf::new(), |acc, r| acc.join(r));
437 440 roots.push(buf);
438 441 }
439 442 PatternSyntax::Path | PatternSyntax::RelPath => {
440 443 let pat = HgPath::new(if pattern == b"." {
441 444 &[] as &[u8]
442 445 } else {
443 446 pattern
444 447 });
445 448 roots.push(pat.to_owned());
446 449 }
447 450 PatternSyntax::RootFiles => {
448 451 let pat = if pattern == b"." {
449 452 &[] as &[u8]
450 453 } else {
451 454 pattern
452 455 };
453 456 dirs.push(HgPathBuf::from_bytes(pat));
454 457 }
455 458 _ => {
456 459 roots.push(HgPathBuf::new());
457 460 }
458 461 }
459 462 }
460 463 (roots, dirs)
461 464 }
462 465
463 466 /// Paths extracted from patterns
464 467 #[derive(Debug, PartialEq)]
465 468 struct RootsDirsAndParents {
466 469 /// Directories to match recursively
467 470 pub roots: HashSet<HgPathBuf>,
468 471 /// Directories to match non-recursively
469 472 pub dirs: HashSet<HgPathBuf>,
470 473 /// Implicitly required directories to go to items in either roots or dirs
471 474 pub parents: HashSet<HgPathBuf>,
472 475 }
473 476
474 477 /// Extract roots, dirs and parents from patterns.
475 478 fn roots_dirs_and_parents(
476 479 ignore_patterns: &[IgnorePattern],
477 480 ) -> PatternResult<RootsDirsAndParents> {
478 481 let (roots, dirs) = roots_and_dirs(ignore_patterns);
479 482
480 483 let mut parents = HashSet::new();
481 484
482 485 parents.extend(
483 486 DirsMultiset::from_manifest(&dirs)
484 487 .map_err(|e| match e {
485 488 DirstateMapError::InvalidPath(e) => e,
486 489 _ => unreachable!(),
487 490 })?
488 491 .iter()
489 492 .map(|k| k.to_owned()),
490 493 );
491 494 parents.extend(
492 495 DirsMultiset::from_manifest(&roots)
493 496 .map_err(|e| match e {
494 497 DirstateMapError::InvalidPath(e) => e,
495 498 _ => unreachable!(),
496 499 })?
497 500 .iter()
498 501 .map(|k| k.to_owned()),
499 502 );
500 503
501 504 Ok(RootsDirsAndParents {
502 505 roots: HashSet::from_iter(roots),
503 506 dirs: HashSet::from_iter(dirs),
504 507 parents,
505 508 })
506 509 }
507 510
508 511 /// Returns a function that checks whether a given file (in the general sense)
509 512 /// should be matched.
510 513 fn build_match<'a, 'b>(
511 514 ignore_patterns: &'a [IgnorePattern],
512 515 root_dir: impl AsRef<Path>,
513 516 ) -> PatternResult<(
514 517 Vec<u8>,
515 518 Box<dyn Fn(&HgPath) -> bool + 'b + Sync>,
516 519 Vec<PatternFileWarning>,
517 520 )> {
518 521 let mut match_funcs: Vec<Box<dyn Fn(&HgPath) -> bool + Sync>> = vec![];
519 522 // For debugging and printing
520 523 let mut patterns = vec![];
521 524 let mut all_warnings = vec![];
522 525
523 526 let (subincludes, ignore_patterns) =
524 527 filter_subincludes(ignore_patterns, root_dir)?;
525 528
526 529 if !subincludes.is_empty() {
527 530 // Build prefix-based matcher functions for subincludes
528 531 let mut submatchers = FastHashMap::default();
529 532 let mut prefixes = vec![];
530 533
531 534 for SubInclude { prefix, root, path } in subincludes.into_iter() {
532 535 let (match_fn, warnings) =
533 536 get_ignore_function(vec![path.to_path_buf()], root)?;
534 537 all_warnings.extend(warnings);
535 538 prefixes.push(prefix.to_owned());
536 539 submatchers.insert(prefix.to_owned(), match_fn);
537 540 }
538 541
539 542 let match_subinclude = move |filename: &HgPath| {
540 543 for prefix in prefixes.iter() {
541 544 if let Some(rel) = filename.relative_to(prefix) {
542 545 if (submatchers.get(prefix).unwrap())(rel) {
543 546 return true;
544 547 }
545 548 }
546 549 }
547 550 false
548 551 };
549 552
550 553 match_funcs.push(Box::new(match_subinclude));
551 554 }
552 555
553 556 if !ignore_patterns.is_empty() {
554 557 // Either do dumb matching if all patterns are rootfiles, or match
555 558 // with a regex.
556 559 if ignore_patterns
557 560 .iter()
558 561 .all(|k| k.syntax == PatternSyntax::RootFiles)
559 562 {
560 563 let dirs: HashSet<_> = ignore_patterns
561 564 .iter()
562 565 .map(|k| k.pattern.to_owned())
563 566 .collect();
564 567 let mut dirs_vec: Vec<_> = dirs.iter().cloned().collect();
565 568
566 569 let match_func = move |path: &HgPath| -> bool {
567 570 let path = path.as_bytes();
568 571 let i = path.iter().rfind(|a| **a == b'/');
569 572 let dir = if let Some(i) = i {
570 573 &path[..*i as usize]
571 574 } else {
572 575 b"."
573 576 };
574 577 dirs.contains(dir.deref())
575 578 };
576 579 match_funcs.push(Box::new(match_func));
577 580
578 581 patterns.extend(b"rootfilesin: ");
579 582 dirs_vec.sort();
580 583 patterns.extend(dirs_vec.escaped_bytes());
581 584 } else {
582 585 let (new_re, match_func) = build_regex_match(&ignore_patterns)?;
583 586 patterns = new_re;
584 587 match_funcs.push(match_func)
585 588 }
586 589 }
587 590
588 591 Ok(if match_funcs.len() == 1 {
589 592 (patterns, match_funcs.remove(0), all_warnings)
590 593 } else {
591 594 (
592 595 patterns,
593 596 Box::new(move |f: &HgPath| -> bool {
594 597 match_funcs.iter().any(|match_func| match_func(f))
595 598 }),
596 599 all_warnings,
597 600 )
598 601 })
599 602 }
600 603
601 604 /// Parses all "ignore" files with their recursive includes and returns a
602 605 /// function that checks whether a given file (in the general sense) should be
603 606 /// ignored.
604 607 pub fn get_ignore_function<'a>(
605 608 all_pattern_files: Vec<PathBuf>,
606 609 root_dir: impl AsRef<Path>,
607 610 ) -> PatternResult<(
608 611 Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>,
609 612 Vec<PatternFileWarning>,
610 613 )> {
611 614 let mut all_patterns = vec![];
612 615 let mut all_warnings = vec![];
613 616
614 617 for pattern_file in all_pattern_files.into_iter() {
615 618 let (patterns, warnings) =
616 619 get_patterns_from_file(pattern_file, &root_dir)?;
617 620
618 621 all_patterns.extend(patterns.to_owned());
619 622 all_warnings.extend(warnings);
620 623 }
621 624 let (matcher, warnings) = IncludeMatcher::new(all_patterns, root_dir)?;
622 625 all_warnings.extend(warnings);
623 626 Ok((
624 627 Box::new(move |path: &HgPath| matcher.matches(path)),
625 628 all_warnings,
626 629 ))
627 630 }
628 631
629 632 impl<'a> IncludeMatcher<'a> {
630 633 pub fn new(
631 634 ignore_patterns: Vec<IgnorePattern>,
632 635 root_dir: impl AsRef<Path>,
633 636 ) -> PatternResult<(Self, Vec<PatternFileWarning>)> {
634 637 let (patterns, match_fn, warnings) =
635 638 build_match(&ignore_patterns, root_dir)?;
636 639 let RootsDirsAndParents {
637 640 roots,
638 641 dirs,
639 642 parents,
640 643 } = roots_dirs_and_parents(&ignore_patterns)?;
641 644
642 645 let prefix = ignore_patterns.iter().any(|k| match k.syntax {
643 646 PatternSyntax::Path | PatternSyntax::RelPath => true,
644 647 _ => false,
645 648 });
646 649
647 650 Ok((
648 651 Self {
649 652 patterns,
650 653 match_fn,
651 654 prefix,
652 655 roots,
653 656 dirs,
654 657 parents,
655 658 },
656 659 warnings,
657 660 ))
658 661 }
659 662
660 663 fn get_all_parents_children(&self) -> DirsChildrenMultiset {
661 664 // TODO cache
662 665 let thing = self
663 666 .dirs
664 667 .iter()
665 668 .chain(self.roots.iter())
666 669 .chain(self.parents.iter());
667 670 DirsChildrenMultiset::new(thing, Some(&self.parents))
668 671 }
669 672 }
670 673
671 674 impl<'a> Display for IncludeMatcher<'a> {
672 675 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
673 676 // XXX What about exact matches?
674 677 // I'm not sure it's worth it to clone the HashSet and keep it
675 678 // around just in case someone wants to display the matcher, plus
676 679 // it's going to be unreadable after a few entries, but we need to
677 680 // inform in this display that exact matches are being used and are
678 681 // (on purpose) missing from the `includes`.
679 682 write!(
680 683 f,
681 684 "IncludeMatcher(includes='{}')",
682 685 String::from_utf8_lossy(&self.patterns.escaped_bytes())
683 686 )
684 687 }
685 688 }
686 689
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use std::path::Path;

    /// Builds the `VisitChildrenSet::Set` made of the given child names.
    macro_rules! children {
        ($($name:expr),+) => {
            VisitChildrenSet::Set(
                vec![$(HgPath::new($name)),+].into_iter().collect(),
            )
        };
    }

    /// The glob patterns shared by the roots/dirs/parents tests.
    fn glob_patterns() -> Vec<IgnorePattern> {
        vec![
            IgnorePattern::new(PatternSyntax::Glob, b"g/h/*", Path::new("")),
            IgnorePattern::new(PatternSyntax::Glob, b"g/h", Path::new("")),
            IgnorePattern::new(PatternSyntax::Glob, b"g*", Path::new("")),
        ]
    }

    #[test]
    fn test_roots_and_dirs() {
        let (roots, dirs) = roots_and_dirs(&glob_patterns());

        let expected_roots = vec![
            HgPathBuf::from_bytes(b"g/h"),
            HgPathBuf::from_bytes(b"g/h"),
            HgPathBuf::new(),
        ];
        assert_eq!(roots, expected_roots);
        assert_eq!(dirs, vec![]);
    }

    #[test]
    fn test_roots_dirs_and_parents() {
        let roots: HashSet<_> =
            vec![HgPathBuf::from_bytes(b"g/h"), HgPathBuf::new()]
                .into_iter()
                .collect();
        let dirs = HashSet::new();
        let parents: HashSet<_> =
            vec![HgPathBuf::new(), HgPathBuf::from_bytes(b"g")]
                .into_iter()
                .collect();

        assert_eq!(
            roots_dirs_and_parents(&glob_patterns()).unwrap(),
            RootsDirsAndParents {
                roots,
                dirs,
                parents
            }
        );
    }

    #[test]
    fn test_filematcher_visit_children_set() {
        // Visitchildrenset
        let files = vec![HgPath::new(b"dir/subdir/foo.txt")];
        let matcher = FileMatcher::new(&files).unwrap();

        let cases = vec![
            (HgPath::new(b""), children!(b"dir")),
            (HgPath::new(b"dir"), children!(b"subdir")),
            (HgPath::new(b"dir/subdir"), children!(b"foo.txt")),
            (HgPath::new(b"dir/subdir/x"), VisitChildrenSet::Empty),
            (HgPath::new(b"dir/subdir/foo.txt"), VisitChildrenSet::Empty),
            (HgPath::new(b"folder"), VisitChildrenSet::Empty),
        ];
        for (dir, expected) in cases {
            assert_eq!(matcher.visit_children_set(dir), expected);
        }
    }

    #[test]
    fn test_filematcher_visit_children_set_files_and_dirs() {
        let files = vec![
            HgPath::new(b"rootfile.txt"),
            HgPath::new(b"a/file1.txt"),
            HgPath::new(b"a/b/file2.txt"),
            // No file in a/b/c
            HgPath::new(b"a/b/c/d/file4.txt"),
        ];
        let matcher = FileMatcher::new(&files).unwrap();

        let cases = vec![
            (HgPath::new(b""), children!(b"a", b"rootfile.txt")),
            (HgPath::new(b"a"), children!(b"b", b"file1.txt")),
            (HgPath::new(b"a/b"), children!(b"c", b"file2.txt")),
            (HgPath::new(b"a/b/c"), children!(b"d")),
            (HgPath::new(b"a/b/c/d"), children!(b"file4.txt")),
            (HgPath::new(b"a/b/c/d/e"), VisitChildrenSet::Empty),
            (HgPath::new(b"folder"), VisitChildrenSet::Empty),
        ];
        for (dir, expected) in cases {
            assert_eq!(matcher.visit_children_set(dir), expected);
        }
    }

    #[cfg(feature = "with-re2")]
    #[test]
    fn test_includematcher() {
        // VisitchildrensetPrefix
        let (matcher, _) = IncludeMatcher::new(
            vec![IgnorePattern::new(
                PatternSyntax::RelPath,
                b"dir/subdir",
                Path::new(""),
            )],
            "",
        )
        .unwrap();

        let cases = vec![
            (HgPath::new(b""), children!(b"dir")),
            (HgPath::new(b"dir"), children!(b"subdir")),
            (HgPath::new(b"dir/subdir"), VisitChildrenSet::Recursive),
            // OPT: This should probably be 'all' if its parent is?
            (HgPath::new(b"dir/subdir/x"), VisitChildrenSet::This),
            (HgPath::new(b"folder"), VisitChildrenSet::Empty),
        ];
        for (dir, expected) in cases {
            assert_eq!(matcher.visit_children_set(dir), expected);
        }

        // VisitchildrensetRootfilesin
        let (matcher, _) = IncludeMatcher::new(
            vec![IgnorePattern::new(
                PatternSyntax::RootFiles,
                b"dir/subdir",
                Path::new(""),
            )],
            "",
        )
        .unwrap();

        let cases = vec![
            (HgPath::new(b""), children!(b"dir")),
            (HgPath::new(b"dir"), children!(b"subdir")),
            (HgPath::new(b"dir/subdir"), VisitChildrenSet::This),
            (HgPath::new(b"dir/subdir/x"), VisitChildrenSet::Empty),
            (HgPath::new(b"folder"), VisitChildrenSet::Empty),
        ];
        for (dir, expected) in cases {
            assert_eq!(matcher.visit_children_set(dir), expected);
        }

        // VisitchildrensetGlob
        let (matcher, _) = IncludeMatcher::new(
            vec![IgnorePattern::new(
                PatternSyntax::Glob,
                b"dir/z*",
                Path::new(""),
            )],
            "",
        )
        .unwrap();

        let cases = vec![
            (HgPath::new(b""), children!(b"dir")),
            (HgPath::new(b"folder"), VisitChildrenSet::Empty),
            (HgPath::new(b"dir"), VisitChildrenSet::This),
            // OPT: these should probably be set().
            (HgPath::new(b"dir/subdir"), VisitChildrenSet::This),
            (HgPath::new(b"dir/subdir/x"), VisitChildrenSet::This),
        ];
        for (dir, expected) in cases {
            assert_eq!(matcher.visit_children_set(dir), expected);
        }
    }
}
General Comments 0
You need to be logged in to leave comments. Login now