##// END OF EJS Templates
rust-filepatterns: unescape comment character property...
Yuya Nishihara -
r42859:12addcc7 default
parent child Browse files
Show More
@@ -1,371 +1,373 b''
1 1 use crate::{
2 utils::{files::get_path_from_bytes, replace_slice, SliceExt},
2 utils::{files::get_path_from_bytes, SliceExt},
3 3 LineNumber, PatternError, PatternFileError,
4 4 };
5 5 use lazy_static::lazy_static;
6 use regex::bytes::Regex;
6 use regex::bytes::{NoExpand, Regex};
7 7 use std::collections::HashMap;
8 8 use std::fs::File;
9 9 use std::io::Read;
10 10 use std::vec::Vec;
11 11
12 12 lazy_static! {
13 13 static ref RE_ESCAPE: Vec<Vec<u8>> = {
14 14 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
15 15 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
16 16 for byte in to_escape {
17 17 v[*byte as usize].insert(0, b'\\');
18 18 }
19 19 v
20 20 };
21 21 }
22 22
/// Replacements applied by `glob_to_re` to the input that follows a `*`.
///
/// Each entry is `(bytes expected next in the glob, regex to emit)`.
/// Entries are tried in order and the first match wins; the last entry has
/// an empty prefix, so it always matches.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = &[
    // `**/`
    (b"*/", b"(?:.*/)?"),
    // `**`
    (b"*", b".*"),
    // lone `*`
    (b"", b"[^/]*"),
];
26 26
/// The pattern kinds recognized by `parse_pattern_syntax`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PatternSyntax {
    /// `re`: the pattern is used as a regex as-is
    Regexp,
    /// Glob that matches at the front of the path
    RootGlob,
    /// Glob that matches at any suffix of the path (still anchored at slashes)
    Glob,
    /// `path`: the pattern is escaped and must be followed by `/` or the end
    Path,
    /// `relpath`: built into a regex the same way as `Path`
    RelPath,
    /// `relglob`: glob regex prefixed with `(?:|.*/)`
    RelGlob,
    /// `relre`: regex prefixed with `.*` unless it starts with `^`
    RelRegexp,
    /// `rootfilesin`: the pattern is a directory name; the rest of the match
    /// must not contain a slash
    RootFiles,
}
40 40
41 41 /// Transforms a glob pattern into a regex
42 42 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
43 43 let mut input = pat;
44 44 let mut res: Vec<u8> = vec![];
45 45 let mut group_depth = 0;
46 46
47 47 while let Some((c, rest)) = input.split_first() {
48 48 input = rest;
49 49
50 50 match c {
51 51 b'*' => {
52 52 for (source, repl) in GLOB_REPLACEMENTS {
53 53 if input.starts_with(source) {
54 54 input = &input[source.len()..];
55 55 res.extend(*repl);
56 56 break;
57 57 }
58 58 }
59 59 }
60 60 b'?' => res.extend(b"."),
61 61 b'[' => {
62 62 match input.iter().skip(1).position(|b| *b == b']') {
63 63 None => res.extend(b"\\["),
64 64 Some(end) => {
65 65 // Account for the one we skipped
66 66 let end = end + 1;
67 67
68 68 res.extend(b"[");
69 69
70 70 for (i, b) in input[..end].iter().enumerate() {
71 71 if *b == b'!' && i == 0 {
72 72 res.extend(b"^")
73 73 } else if *b == b'^' && i == 0 {
74 74 res.extend(b"\\^")
75 75 } else if *b == b'\\' {
76 76 res.extend(b"\\\\")
77 77 } else {
78 78 res.push(*b)
79 79 }
80 80 }
81 81 res.extend(b"]");
82 82 input = &input[end + 1..];
83 83 }
84 84 }
85 85 }
86 86 b'{' => {
87 87 group_depth += 1;
88 88 res.extend(b"(?:")
89 89 }
90 90 b'}' if group_depth > 0 => {
91 91 group_depth -= 1;
92 92 res.extend(b")");
93 93 }
94 94 b',' if group_depth > 0 => res.extend(b"|"),
95 95 b'\\' => {
96 96 let c = {
97 97 if let Some((c, rest)) = input.split_first() {
98 98 input = rest;
99 99 c
100 100 } else {
101 101 c
102 102 }
103 103 };
104 104 res.extend(&RE_ESCAPE[*c as usize])
105 105 }
106 106 _ => res.extend(&RE_ESCAPE[*c as usize]),
107 107 }
108 108 }
109 109 res
110 110 }
111 111
112 112 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
113 113 pattern
114 114 .iter()
115 115 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
116 116 .collect()
117 117 }
118 118
119 119 fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
120 120 match kind {
121 121 b"re" => Ok(PatternSyntax::Regexp),
122 122 b"path" => Ok(PatternSyntax::Path),
123 123 b"relpath" => Ok(PatternSyntax::RelPath),
124 124 b"rootfilesin" => Ok(PatternSyntax::RootFiles),
125 125 b"relglob" => Ok(PatternSyntax::RelGlob),
126 126 b"relre" => Ok(PatternSyntax::RelRegexp),
127 127 b"glob" => Ok(PatternSyntax::Glob),
128 128 b"rootglob" => Ok(PatternSyntax::RootGlob),
129 129 _ => Err(PatternError::UnsupportedSyntax(
130 130 String::from_utf8_lossy(kind).to_string(),
131 131 )),
132 132 }
133 133 }
134 134
135 135 /// Builds the regex that corresponds to the given pattern.
136 136 /// If within a `syntax: regexp` context, returns the pattern,
137 137 /// otherwise, returns the corresponding regex.
138 138 fn _build_single_regex(
139 139 syntax: PatternSyntax,
140 140 pattern: &[u8],
141 141 globsuffix: &[u8],
142 142 ) -> Vec<u8> {
143 143 if pattern.is_empty() {
144 144 return vec![];
145 145 }
146 146 match syntax {
147 147 PatternSyntax::Regexp => pattern.to_owned(),
148 148 PatternSyntax::RelRegexp => {
149 149 if pattern[0] == b'^' {
150 150 return pattern.to_owned();
151 151 }
152 152 let mut res = b".*".to_vec();
153 153 res.extend(pattern);
154 154 res
155 155 }
156 156 PatternSyntax::Path | PatternSyntax::RelPath => {
157 157 if pattern == b"." {
158 158 return vec![];
159 159 }
160 160 let mut pattern = escape_pattern(pattern);
161 161 pattern.extend(b"(?:/|$)");
162 162 pattern
163 163 }
164 164 PatternSyntax::RootFiles => {
165 165 let mut res = if pattern == b"." {
166 166 vec![]
167 167 } else {
168 168 // Pattern is a directory name.
169 169 let mut as_vec: Vec<u8> = escape_pattern(pattern);
170 170 as_vec.push(b'/');
171 171 as_vec
172 172 };
173 173
174 174 // Anything after the pattern must be a non-directory.
175 175 res.extend(b"[^/]+$");
176 176 res
177 177 }
178 178 PatternSyntax::Glob
179 179 | PatternSyntax::RelGlob
180 180 | PatternSyntax::RootGlob => {
181 181 let mut res: Vec<u8> = vec![];
182 182 if syntax == PatternSyntax::RelGlob {
183 183 res.extend(b"(?:|.*/)");
184 184 }
185 185
186 186 res.extend(glob_to_re(pattern));
187 187 res.extend(globsuffix.iter());
188 188 res
189 189 }
190 190 }
191 191 }
192 192
/// Bytes whose presence means a glob cannot take the shortcut in
/// `build_single_regex`: `*`, `?`, `[`, `]`, `{`, `}` and `\`.
const GLOB_SPECIAL_CHARACTERS: [u8; 7] = *b"*?[]{}\\";
195 195
196 196 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
197 197 /// that don't need to be transformed into a regex.
198 198 pub fn build_single_regex(
199 199 kind: &[u8],
200 200 pat: &[u8],
201 201 globsuffix: &[u8],
202 202 ) -> Result<Vec<u8>, PatternError> {
203 203 let enum_kind = parse_pattern_syntax(kind)?;
204 204 if enum_kind == PatternSyntax::RootGlob
205 205 && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
206 206 {
207 207 let mut escaped = escape_pattern(pat);
208 208 escaped.extend(b"(?:/|$)");
209 209 Ok(escaped)
210 210 } else {
211 211 Ok(_build_single_regex(enum_kind, pat, globsuffix))
212 212 }
213 213 }
214 214
215 215 lazy_static! {
216 216 static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
217 217 let mut m = HashMap::new();
218 218
219 219 m.insert(b"re".as_ref(), b"relre:".as_ref());
220 220 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
221 221 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
222 222 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
223 223 m.insert(b"include".as_ref(), b"include".as_ref());
224 224 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
225 225 m
226 226 };
227 227 }
228 228
229 229 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
230 230 type WarningTuple = (Vec<u8>, Vec<u8>);
231 231
232 232 pub fn parse_pattern_file_contents(
233 233 lines: &[u8],
234 234 file_path: &[u8],
235 235 warn: bool,
236 236 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
237 237 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
238 let comment_escape_regex = Regex::new(r"\\#").unwrap();
238 239 let mut inputs: Vec<PatternTuple> = vec![];
239 240 let mut warnings: Vec<WarningTuple> = vec![];
240 241
241 242 let mut current_syntax = b"relre:".as_ref();
242 243
243 244 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
244 245 let line_number = line_number + 1;
245 246
247 let line_buf;
246 248 if line.contains(&b'#') {
247 249 if let Some(cap) = comment_regex.captures(line) {
248 250 line = &line[..cap.get(1).unwrap().end()]
249 251 }
250 let mut line = line.to_owned();
251 replace_slice(&mut line, br"\#", b"#");
252 line_buf = comment_escape_regex.replace_all(line, NoExpand(b"#"));
253 line = &line_buf;
252 254 }
253 255
254 256 let mut line = line.trim_end();
255 257
256 258 if line.is_empty() {
257 259 continue;
258 260 }
259 261
260 262 if line.starts_with(b"syntax:") {
261 263 let syntax = line[b"syntax:".len()..].trim();
262 264
263 265 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
264 266 current_syntax = rel_syntax;
265 267 } else if warn {
266 268 warnings.push((file_path.to_owned(), syntax.to_owned()));
267 269 }
268 270 continue;
269 271 }
270 272
271 273 let mut line_syntax: &[u8] = &current_syntax;
272 274
273 275 for (s, rels) in SYNTAXES.iter() {
274 276 if line.starts_with(rels) {
275 277 line_syntax = rels;
276 278 line = &line[rels.len()..];
277 279 break;
278 280 } else if line.starts_with(&[s, b":".as_ref()].concat()) {
279 281 line_syntax = rels;
280 282 line = &line[s.len() + 1..];
281 283 break;
282 284 }
283 285 }
284 286
285 287 inputs.push((
286 288 [line_syntax, line].concat(),
287 289 line_number,
288 290 line.to_owned(),
289 291 ));
290 292 }
291 293 (inputs, warnings)
292 294 }
293 295
294 296 pub fn read_pattern_file(
295 297 file_path: &[u8],
296 298 warn: bool,
297 299 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
298 300 let mut f = File::open(get_path_from_bytes(file_path))?;
299 301 let mut contents = Vec::new();
300 302
301 303 f.read_to_end(&mut contents)?;
302 304
303 305 Ok(parse_pattern_file_contents(&contents, file_path, warn))
304 306 }
305 307
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary bytes pass through; each special byte gains a backslash.
    #[test]
    fn escape_pattern_test() {
        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());

        // All escape codes
        let special = br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#;
        let escaped =
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#;
        assert_eq!(escape_pattern(special), escaped.to_vec());
    }

    /// Each glob construct is translated to the expected regex fragment.
    #[test]
    fn glob_test() {
        let cases: &[(&[u8], &[u8])] = &[
            (br#"?"#, br#"."#),
            (br#"*"#, br#"[^/]*"#),
            (br#"**"#, br#".*"#),
            (br#"**/a"#, br#"(?:.*/)?a"#),
            (br#"a/**/b"#, br#"a/(?:.*/)?b"#),
            (br#"[a*?!^][^b][!c]"#, br#"[a*?!^][\^b][^c]"#),
            (br#"{a,b}"#, br#"(?:a|b)"#),
            (br#".\*\?"#, br#"\.\*\?"#),
        ];

        for (glob, expected) in cases {
            assert_eq!(glob_to_re(glob), expected.to_vec());
        }
    }

    /// `syntax:` lines and per-line prefixes select the stored prefix.
    #[test]
    fn test_parse_pattern_file_contents() {
        let parse = |lines: &[u8]| {
            parse_pattern_file_contents(lines, b"file_path", false).0
        };

        assert_eq!(
            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
            parse(b"syntax: glob\n*.elc"),
        );

        assert_eq!(parse(b"syntax: include\nsyntax: glob"), vec![]);

        assert_eq!(
            parse(b"glob:**.o"),
            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
        );
    }

    /// Root globs without special characters skip the glob translation.
    #[test]
    fn test_build_single_regex_shortcut() {
        let cases: &[(&[u8], &[u8])] = &[
            (b"", br"(?:/|$)"),
            (b"whatever", br"whatever(?:/|$)"),
            (b"*.o", br"[^/]*\.o"),
        ];

        for (pat, expected) in cases {
            assert_eq!(
                expected.to_vec(),
                build_single_regex(b"rootglob", pat, b"").unwrap()
            );
        }
    }
}
General Comments 0
You need to be logged in to leave comments. Login now