##// END OF EJS Templates
rust-regex: fix shortcut for exact matches...
Raphaël Gomès -
r42631:48f1f864 default
parent child Browse files
Show More
@@ -1,354 +1,372 b''
1 1 use crate::{LineNumber, PatternError, PatternFileError};
2 2 use regex::bytes::Regex;
3 3 use std::collections::HashMap;
4 4 use std::fs::File;
5 5 use std::io::Read;
6 6 use std::vec::Vec;
7 7 use utils::files::get_path_from_bytes;
8 8 use utils::{replace_slice, SliceExt};
9 9
10 10 lazy_static! {
11 11 static ref reescape: Vec<Vec<u8>> = {
12 12 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
13 13 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
14 14 for byte in to_escape {
15 15 v[*byte as usize].insert(0, b'\\');
16 16 }
17 17 v
18 18 };
19 19 }
20 20
/// Regex fragments substituted for a glob `*`, keyed by the bytes that follow
/// that `*` in the pattern.
///
/// These are matched in order and the first matching key wins. The last key is
/// empty, so it always matches and acts as the fallback for a lone `*`.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = &[
    (b"*/", b"(?:.*/)?"),
    (b"*", b".*"),
    (b"", b"[^/]*"),
];
24 24
/// The pattern syntaxes understood by this module, one per keyword accepted
/// by `parse_pattern_syntax`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum PatternSyntax {
    /// `re`: the pattern is used verbatim as a regex
    Regexp,
    /// Glob that matches at the front of the path
    RootGlob,
    /// Glob that matches at any suffix of the path (still anchored at slashes)
    Glob,
    /// `path`: literal path, regex-escaped and followed by `(?:/|$)`
    Path,
    /// `relpath`: turned into a regex exactly like `Path`
    RelPath,
    /// `relglob`: glob whose regex is prefixed with `(?:|.*/)`
    RelGlob,
    /// `relre`: regex prefixed with `.*` unless it already starts with `^`
    RelRegexp,
    /// `rootfilesin`: escaped directory name followed by `[^/]+$`
    RootFiles,
}
38 38
39 39 /// Transforms a glob pattern into a regex
40 40 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
41 41 let mut input = pat;
42 42 let mut res: Vec<u8> = vec![];
43 43 let mut group_depth = 0;
44 44
45 45 while let Some((c, rest)) = input.split_first() {
46 46 input = rest;
47 47
48 48 match c {
49 49 b'*' => {
50 50 for (source, repl) in GLOB_REPLACEMENTS {
51 51 if input.starts_with(source) {
52 52 input = &input[source.len()..];
53 53 res.extend(*repl);
54 54 break;
55 55 }
56 56 }
57 57 }
58 58 b'?' => res.extend(b"."),
59 59 b'[' => {
60 60 match input.iter().skip(1).position(|b| *b == b']') {
61 61 None => res.extend(b"\\["),
62 62 Some(end) => {
63 63 // Account for the one we skipped
64 64 let end = end + 1;
65 65
66 66 res.extend(b"[");
67 67
68 68 for (i, b) in input[..end].iter().enumerate() {
69 69 if *b == b'!' && i == 0 {
70 70 res.extend(b"^")
71 71 } else if *b == b'^' && i == 0 {
72 72 res.extend(b"\\^")
73 73 } else if *b == b'\\' {
74 74 res.extend(b"\\\\")
75 75 } else {
76 76 res.push(*b)
77 77 }
78 78 }
79 79 res.extend(b"]");
80 80 input = &input[end + 1..];
81 81 }
82 82 }
83 83 }
84 84 b'{' => {
85 85 group_depth += 1;
86 86 res.extend(b"(?:")
87 87 }
88 88 b'}' if group_depth > 0 => {
89 89 group_depth -= 1;
90 90 res.extend(b")");
91 91 }
92 92 b',' if group_depth > 0 => res.extend(b"|"),
93 93 b'\\' => {
94 94 let c = {
95 95 if let Some((c, rest)) = input.split_first() {
96 96 input = rest;
97 97 c
98 98 } else {
99 99 c
100 100 }
101 101 };
102 102 res.extend(&reescape[*c as usize])
103 103 }
104 104 _ => res.extend(&reescape[*c as usize]),
105 105 }
106 106 }
107 107 res
108 108 }
109 109
110 110 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
111 111 pattern
112 112 .iter()
113 113 .flat_map(|c| reescape[*c as usize].clone())
114 114 .collect()
115 115 }
116 116
117 117 fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
118 118 match kind {
119 119 b"re" => Ok(PatternSyntax::Regexp),
120 120 b"path" => Ok(PatternSyntax::Path),
121 121 b"relpath" => Ok(PatternSyntax::RelPath),
122 122 b"rootfilesin" => Ok(PatternSyntax::RootFiles),
123 123 b"relglob" => Ok(PatternSyntax::RelGlob),
124 124 b"relre" => Ok(PatternSyntax::RelRegexp),
125 125 b"glob" => Ok(PatternSyntax::Glob),
126 126 b"rootglob" => Ok(PatternSyntax::RootGlob),
127 127 _ => Err(PatternError::UnsupportedSyntax(
128 128 String::from_utf8_lossy(kind).to_string(),
129 129 )),
130 130 }
131 131 }
132 132
133 133 /// Builds the regex that corresponds to the given pattern.
134 134 /// If within a `syntax: regexp` context, returns the pattern,
135 135 /// otherwise, returns the corresponding regex.
136 136 fn _build_single_regex(
137 137 syntax: PatternSyntax,
138 138 pattern: &[u8],
139 139 globsuffix: &[u8],
140 140 ) -> Vec<u8> {
141 141 if pattern.is_empty() {
142 142 return vec![];
143 143 }
144 144 match syntax {
145 145 PatternSyntax::Regexp => pattern.to_owned(),
146 146 PatternSyntax::RelRegexp => {
147 147 if pattern[0] == b'^' {
148 148 return pattern.to_owned();
149 149 }
150 150 let mut res = b".*".to_vec();
151 151 res.extend(pattern);
152 152 res
153 153 }
154 154 PatternSyntax::Path | PatternSyntax::RelPath => {
155 155 if pattern == b"." {
156 156 return vec![];
157 157 }
158 158 let mut pattern = escape_pattern(pattern);
159 159 pattern.extend(b"(?:/|$)");
160 160 pattern
161 161 }
162 162 PatternSyntax::RootFiles => {
163 163 let mut res = if pattern == b"." {
164 164 vec![]
165 165 } else {
166 166 // Pattern is a directory name.
167 167 let mut as_vec: Vec<u8> = escape_pattern(pattern);
168 168 as_vec.push(b'/');
169 169 as_vec
170 170 };
171 171
172 172 // Anything after the pattern must be a non-directory.
173 173 res.extend(b"[^/]+$");
174 174 res
175 175 }
176 176 PatternSyntax::Glob
177 177 | PatternSyntax::RelGlob
178 178 | PatternSyntax::RootGlob => {
179 179 let mut res: Vec<u8> = vec![];
180 180 if syntax == PatternSyntax::RelGlob {
181 181 res.extend(b"(?:|.*/)");
182 182 }
183 183
184 184 res.extend(glob_to_re(pattern));
185 185 res.extend(globsuffix.iter());
186 186 res
187 187 }
188 188 }
189 189 }
190 190
191 191 const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
192 192 [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
193 193
194 194 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
195 195 /// that don't need to be transformed into a regex.
196 196 pub fn build_single_regex(
197 197 kind: &[u8],
198 198 pat: &[u8],
199 199 globsuffix: &[u8],
200 200 ) -> Result<Vec<u8>, PatternError> {
201 201 let enum_kind = parse_pattern_syntax(kind)?;
202 202 if enum_kind == PatternSyntax::RootGlob
203 && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
203 && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
204 204 {
205 Ok(pat.to_vec())
205 let mut escaped = escape_pattern(pat);
206 escaped.extend(b"(?:/|$)");
207 Ok(escaped)
206 208 } else {
207 209 Ok(_build_single_regex(enum_kind, pat, globsuffix))
208 210 }
209 211 }
210 212
211 213 lazy_static! {
212 214 static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
213 215 let mut m = HashMap::new();
214 216
215 217 m.insert(b"re".as_ref(), b"relre:".as_ref());
216 218 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
217 219 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
218 220 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
219 221 m.insert(b"include".as_ref(), b"include".as_ref());
220 222 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
221 223 m
222 224 };
223 225 }
224 226
225 227 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
226 228 type WarningTuple = (String, String);
227 229
228 230 pub fn parse_pattern_file_contents(
229 231 lines: &[u8],
230 232 file_path: &[u8],
231 233 warn: bool,
232 234 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
233 235 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
234 236 let mut inputs: Vec<PatternTuple> = vec![];
235 237 let mut warnings: Vec<WarningTuple> = vec![];
236 238
237 239 let mut current_syntax = b"relre:".as_ref();
238 240
239 241 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
240 242 let line_number = line_number + 1;
241 243
242 244 if line.contains(&('#' as u8)) {
243 245 if let Some(cap) = comment_regex.captures(line) {
244 246 line = &line[..cap.get(1).unwrap().end()]
245 247 }
246 248 let mut line = line.to_owned();
247 249 replace_slice(&mut line, br"\#", b"#");
248 250 }
249 251
250 252 let mut line = line.trim_end();
251 253
252 254 if line.is_empty() {
253 255 continue;
254 256 }
255 257
256 258 if line.starts_with(b"syntax:") {
257 259 let syntax = line[b"syntax:".len()..].trim();
258 260
259 261 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
260 262 current_syntax = rel_syntax;
261 263 } else if warn {
262 264 warnings.push((
263 265 String::from_utf8_lossy(file_path).to_string(),
264 266 String::from_utf8_lossy(syntax).to_string(),
265 267 ));
266 268 }
267 269 continue;
268 270 }
269 271
270 272 let mut line_syntax: &[u8] = &current_syntax;
271 273
272 274 for (s, rels) in SYNTAXES.iter() {
273 275 if line.starts_with(rels) {
274 276 line_syntax = rels;
275 277 line = &line[rels.len()..];
276 278 break;
277 279 } else if line.starts_with(&[s, b":".as_ref()].concat()) {
278 280 line_syntax = rels;
279 281 line = &line[s.len() + 1..];
280 282 break;
281 283 }
282 284 }
283 285
284 286 inputs.push((
285 287 [line_syntax, line].concat(),
286 288 line_number,
287 289 line.to_owned(),
288 290 ));
289 291 }
290 292 (inputs, warnings)
291 293 }
292 294
293 295 pub fn read_pattern_file(
294 296 file_path: &[u8],
295 297 warn: bool,
296 298 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
297 299 let mut f = File::open(get_path_from_bytes(file_path))?;
298 300 let mut contents = Vec::new();
299 301
300 302 f.read_to_end(&mut contents)?;
301 303
302 304 Ok(parse_pattern_file_contents(&contents, file_path, warn))
303 305 }
304 306
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn escape_pattern_test() {
        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());

        // All escape codes
        let special = br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#;
        let escaped =
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#;
        assert_eq!(escape_pattern(special), escaped.to_vec());
    }

    #[test]
    fn glob_test() {
        // (glob, expected regex)
        let cases: &[(&[u8], &[u8])] = &[
            (br#"?"#, br#"."#),
            (br#"*"#, br#"[^/]*"#),
            (br#"**"#, br#".*"#),
            (br#"**/a"#, br#"(?:.*/)?a"#),
            (br#"a/**/b"#, br#"a/(?:.*/)?b"#),
            (br#"[a*?!^][^b][!c]"#, br#"[a*?!^][\^b][^c]"#),
            (br#"{a,b}"#, br#"(?:a|b)"#),
            (br#".\*\?"#, br#"\.\*\?"#),
        ];

        for (glob, expected) in cases {
            assert_eq!(glob_to_re(glob), expected.to_vec());
        }
    }

    #[test]
    fn test_parse_pattern_file_contents() {
        // Only the parsed patterns are checked here, not the warnings
        let parse = |lines: &[u8]| {
            parse_pattern_file_contents(lines, b"file_path", false).0
        };

        assert_eq!(
            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
            parse(b"syntax: glob\n*.elc"),
        );

        assert_eq!(parse(b"syntax: include\nsyntax: glob"), vec![]);

        assert_eq!(
            parse(b"glob:**.o"),
            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
        );
    }

    #[test]
    fn test_build_single_regex_shortcut() {
        let build =
            |pat: &[u8]| build_single_regex(b"rootglob", pat, b"").unwrap();

        assert_eq!(br"(?:/|$)".to_vec(), build(b""));
        assert_eq!(br"whatever(?:/|$)".to_vec(), build(b"whatever"));
        assert_eq!(br"[^/]*\.o".to_vec(), build(b"*.o"));
    }
}
General Comments 0
You need to be logged in to leave comments. Login now