# HG changeset patch # User Raphaël Gomès # Date 2020-05-07 21:52:08 # Node ID eb301282bacc3b053afc3e6df7d70bf3a2a62a28 # Parent 8c0c1161614efaaddf9b12a454d1419f62ceb60a rust-regex: fix issues with regex anchoring and performance It turns out that the way I tried to work around `regex`'s behavior difference with `re2` and Python's `re` was 1) buggy and 2) much more complicated than needed. In a few words: `regex` adds `.*` on either side of patterns when no start or end anchor is present. My previous workaround put `^` or `$` for every pattern, which is wrong even without the other 2 bugs on top of it. Wrapping the whole pattern as `^(?:PATTERN)` right at the end of the `regex` path fixes the issue. I've opened an issue to get a build option instead: https://github.com/rust-lang/regex/issues/675 Differential Revision: https://phab.mercurial-scm.org/D8506 diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs --- a/rust/hg-core/src/filepatterns.rs +++ b/rust/hg-core/src/filepatterns.rs @@ -176,9 +176,7 @@ fn _build_single_regex(entry: &IgnorePat return vec![]; } match syntax { - // The `regex` crate adds `.*` to the start and end of expressions - // if there are no anchors, so add them. - PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(), + PatternSyntax::Regexp => pattern.to_owned(), PatternSyntax::RelRegexp => { // The `regex` crate accepts `**` while `re2` and Python's `re` // do not. Checking for `*` correctly triggers the same error all @@ -196,15 +194,14 @@ fn _build_single_regex(entry: &IgnorePat } PatternSyntax::RootFiles => { let mut res = if pattern == b"." { - vec![b'^'] + vec![] } else { // Pattern is a directory name. - [b"^", escape_pattern(pattern).as_slice(), b"/"].concat() + [escape_pattern(pattern).as_slice(), b"/"].concat() }; // Anything after the pattern must be a non-directory. 
res.extend(b"[^/]+$"); - res.push(b'$'); res } PatternSyntax::RelGlob => { @@ -216,7 +213,7 @@ fn _build_single_regex(entry: &IgnorePat } } PatternSyntax::Glob | PatternSyntax::RootGlob => { - [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat() + [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat() } PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(), } diff --git a/rust/hg-core/src/matchers.rs b/rust/hg-core/src/matchers.rs --- a/rust/hg-core/src/matchers.rs +++ b/rust/hg-core/src/matchers.rs @@ -347,7 +347,9 @@ fn re_matcher( ) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> { use std::io::Write; - let mut escaped_bytes = vec![]; + // The `regex` crate adds `.*` to the start and end of expressions if there + // are no anchors, so add the start anchor. + let mut escaped_bytes = vec![b'^', b'(', b'?', b':']; for byte in pattern { if *byte > 127 { write!(escaped_bytes, "\\x{:x}", *byte).unwrap(); @@ -355,6 +357,7 @@ fn re_matcher( escaped_bytes.push(*byte); } } + escaped_bytes.push(b')'); // Avoid the cost of UTF8 checking //