##// END OF EJS Templates
rust-filepatterns: fix type of warnings tuple to (bytes, bytes)...
Yuya Nishihara -
r42857:02476018 default
parent child Browse files
Show More
@@ -1,374 +1,371 b''
1 1 use crate::{
2 2 utils::{files::get_path_from_bytes, replace_slice, SliceExt},
3 3 LineNumber, PatternError, PatternFileError,
4 4 };
5 5 use lazy_static::lazy_static;
6 6 use regex::bytes::Regex;
7 7 use std::collections::HashMap;
8 8 use std::fs::File;
9 9 use std::io::Read;
10 10 use std::vec::Vec;
11 11
12 12 lazy_static! {
13 13 static ref RE_ESCAPE: Vec<Vec<u8>> = {
14 14 let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
15 15 let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
16 16 for byte in to_escape {
17 17 v[*byte as usize].insert(0, b'\\');
18 18 }
19 19 v
20 20 };
21 21 }
22 22
/// Continuations looked for right after a `*` has been consumed, paired with
/// the regex they produce. These are matched in order, so the empty
/// continuation (a lone `*`) acts as the fallback.
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = &[
    (b"*/", b"(?:.*/)?"),
    (b"*", b".*"),
    (b"", b"[^/]*"),
];
26 26
/// The pattern kinds understood by this module. Each variant corresponds to
/// one of the kind names accepted by `parse_pattern_syntax`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum PatternSyntax {
    /// Kind name `re`
    Regexp,
    /// Glob that matches at the front of the path (kind name `rootglob`)
    RootGlob,
    /// Glob that matches at any suffix of the path (still anchored at
    /// slashes) (kind name `glob`)
    Glob,
    /// Kind name `path`
    Path,
    /// Kind name `relpath`
    RelPath,
    /// Kind name `relglob`
    RelGlob,
    /// Kind name `relre`
    RelRegexp,
    /// Kind name `rootfilesin`
    RootFiles,
}
40 40
41 41 /// Transforms a glob pattern into a regex
42 42 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
43 43 let mut input = pat;
44 44 let mut res: Vec<u8> = vec![];
45 45 let mut group_depth = 0;
46 46
47 47 while let Some((c, rest)) = input.split_first() {
48 48 input = rest;
49 49
50 50 match c {
51 51 b'*' => {
52 52 for (source, repl) in GLOB_REPLACEMENTS {
53 53 if input.starts_with(source) {
54 54 input = &input[source.len()..];
55 55 res.extend(*repl);
56 56 break;
57 57 }
58 58 }
59 59 }
60 60 b'?' => res.extend(b"."),
61 61 b'[' => {
62 62 match input.iter().skip(1).position(|b| *b == b']') {
63 63 None => res.extend(b"\\["),
64 64 Some(end) => {
65 65 // Account for the one we skipped
66 66 let end = end + 1;
67 67
68 68 res.extend(b"[");
69 69
70 70 for (i, b) in input[..end].iter().enumerate() {
71 71 if *b == b'!' && i == 0 {
72 72 res.extend(b"^")
73 73 } else if *b == b'^' && i == 0 {
74 74 res.extend(b"\\^")
75 75 } else if *b == b'\\' {
76 76 res.extend(b"\\\\")
77 77 } else {
78 78 res.push(*b)
79 79 }
80 80 }
81 81 res.extend(b"]");
82 82 input = &input[end + 1..];
83 83 }
84 84 }
85 85 }
86 86 b'{' => {
87 87 group_depth += 1;
88 88 res.extend(b"(?:")
89 89 }
90 90 b'}' if group_depth > 0 => {
91 91 group_depth -= 1;
92 92 res.extend(b")");
93 93 }
94 94 b',' if group_depth > 0 => res.extend(b"|"),
95 95 b'\\' => {
96 96 let c = {
97 97 if let Some((c, rest)) = input.split_first() {
98 98 input = rest;
99 99 c
100 100 } else {
101 101 c
102 102 }
103 103 };
104 104 res.extend(&RE_ESCAPE[*c as usize])
105 105 }
106 106 _ => res.extend(&RE_ESCAPE[*c as usize]),
107 107 }
108 108 }
109 109 res
110 110 }
111 111
112 112 fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
113 113 pattern
114 114 .iter()
115 115 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
116 116 .collect()
117 117 }
118 118
119 119 fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
120 120 match kind {
121 121 b"re" => Ok(PatternSyntax::Regexp),
122 122 b"path" => Ok(PatternSyntax::Path),
123 123 b"relpath" => Ok(PatternSyntax::RelPath),
124 124 b"rootfilesin" => Ok(PatternSyntax::RootFiles),
125 125 b"relglob" => Ok(PatternSyntax::RelGlob),
126 126 b"relre" => Ok(PatternSyntax::RelRegexp),
127 127 b"glob" => Ok(PatternSyntax::Glob),
128 128 b"rootglob" => Ok(PatternSyntax::RootGlob),
129 129 _ => Err(PatternError::UnsupportedSyntax(
130 130 String::from_utf8_lossy(kind).to_string(),
131 131 )),
132 132 }
133 133 }
134 134
135 135 /// Builds the regex that corresponds to the given pattern.
136 136 /// If within a `syntax: regexp` context, returns the pattern,
137 137 /// otherwise, returns the corresponding regex.
138 138 fn _build_single_regex(
139 139 syntax: PatternSyntax,
140 140 pattern: &[u8],
141 141 globsuffix: &[u8],
142 142 ) -> Vec<u8> {
143 143 if pattern.is_empty() {
144 144 return vec![];
145 145 }
146 146 match syntax {
147 147 PatternSyntax::Regexp => pattern.to_owned(),
148 148 PatternSyntax::RelRegexp => {
149 149 if pattern[0] == b'^' {
150 150 return pattern.to_owned();
151 151 }
152 152 let mut res = b".*".to_vec();
153 153 res.extend(pattern);
154 154 res
155 155 }
156 156 PatternSyntax::Path | PatternSyntax::RelPath => {
157 157 if pattern == b"." {
158 158 return vec![];
159 159 }
160 160 let mut pattern = escape_pattern(pattern);
161 161 pattern.extend(b"(?:/|$)");
162 162 pattern
163 163 }
164 164 PatternSyntax::RootFiles => {
165 165 let mut res = if pattern == b"." {
166 166 vec![]
167 167 } else {
168 168 // Pattern is a directory name.
169 169 let mut as_vec: Vec<u8> = escape_pattern(pattern);
170 170 as_vec.push(b'/');
171 171 as_vec
172 172 };
173 173
174 174 // Anything after the pattern must be a non-directory.
175 175 res.extend(b"[^/]+$");
176 176 res
177 177 }
178 178 PatternSyntax::Glob
179 179 | PatternSyntax::RelGlob
180 180 | PatternSyntax::RootGlob => {
181 181 let mut res: Vec<u8> = vec![];
182 182 if syntax == PatternSyntax::RelGlob {
183 183 res.extend(b"(?:|.*/)");
184 184 }
185 185
186 186 res.extend(glob_to_re(pattern));
187 187 res.extend(globsuffix.iter());
188 188 res
189 189 }
190 190 }
191 191 }
192 192
193 193 const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
194 194 [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
195 195
196 196 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
197 197 /// that don't need to be transformed into a regex.
198 198 pub fn build_single_regex(
199 199 kind: &[u8],
200 200 pat: &[u8],
201 201 globsuffix: &[u8],
202 202 ) -> Result<Vec<u8>, PatternError> {
203 203 let enum_kind = parse_pattern_syntax(kind)?;
204 204 if enum_kind == PatternSyntax::RootGlob
205 205 && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
206 206 {
207 207 let mut escaped = escape_pattern(pat);
208 208 escaped.extend(b"(?:/|$)");
209 209 Ok(escaped)
210 210 } else {
211 211 Ok(_build_single_regex(enum_kind, pat, globsuffix))
212 212 }
213 213 }
214 214
215 215 lazy_static! {
216 216 static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
217 217 let mut m = HashMap::new();
218 218
219 219 m.insert(b"re".as_ref(), b"relre:".as_ref());
220 220 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
221 221 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
222 222 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
223 223 m.insert(b"include".as_ref(), b"include".as_ref());
224 224 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
225 225 m
226 226 };
227 227 }
228 228
229 229 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
230 type WarningTuple = (String, String);
230 type WarningTuple = (Vec<u8>, Vec<u8>);
231 231
232 232 pub fn parse_pattern_file_contents(
233 233 lines: &[u8],
234 234 file_path: &[u8],
235 235 warn: bool,
236 236 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
237 237 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
238 238 let mut inputs: Vec<PatternTuple> = vec![];
239 239 let mut warnings: Vec<WarningTuple> = vec![];
240 240
241 241 let mut current_syntax = b"relre:".as_ref();
242 242
243 243 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
244 244 let line_number = line_number + 1;
245 245
246 246 if line.contains(&('#' as u8)) {
247 247 if let Some(cap) = comment_regex.captures(line) {
248 248 line = &line[..cap.get(1).unwrap().end()]
249 249 }
250 250 let mut line = line.to_owned();
251 251 replace_slice(&mut line, br"\#", b"#");
252 252 }
253 253
254 254 let mut line = line.trim_end();
255 255
256 256 if line.is_empty() {
257 257 continue;
258 258 }
259 259
260 260 if line.starts_with(b"syntax:") {
261 261 let syntax = line[b"syntax:".len()..].trim();
262 262
263 263 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
264 264 current_syntax = rel_syntax;
265 265 } else if warn {
266 warnings.push((
267 String::from_utf8_lossy(file_path).to_string(),
268 String::from_utf8_lossy(syntax).to_string(),
269 ));
266 warnings.push((file_path.to_owned(), syntax.to_owned()));
270 267 }
271 268 continue;
272 269 }
273 270
274 271 let mut line_syntax: &[u8] = &current_syntax;
275 272
276 273 for (s, rels) in SYNTAXES.iter() {
277 274 if line.starts_with(rels) {
278 275 line_syntax = rels;
279 276 line = &line[rels.len()..];
280 277 break;
281 278 } else if line.starts_with(&[s, b":".as_ref()].concat()) {
282 279 line_syntax = rels;
283 280 line = &line[s.len() + 1..];
284 281 break;
285 282 }
286 283 }
287 284
288 285 inputs.push((
289 286 [line_syntax, line].concat(),
290 287 line_number,
291 288 line.to_owned(),
292 289 ));
293 290 }
294 291 (inputs, warnings)
295 292 }
296 293
297 294 pub fn read_pattern_file(
298 295 file_path: &[u8],
299 296 warn: bool,
300 297 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
301 298 let mut f = File::open(get_path_from_bytes(file_path))?;
302 299 let mut contents = Vec::new();
303 300
304 301 f.read_to_end(&mut contents)?;
305 302
306 303 Ok(parse_pattern_file_contents(&contents, file_path, warn))
307 304 }
308 305
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn escape_pattern_test() {
        // Bytes that must come out exactly as they went in.
        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
        assert_eq!(escape_pattern(untouched), untouched.to_vec());

        // All escape codes. These are raw strings, so `\t` and friends are
        // a literal backslash followed by a letter.
        let special = br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#;
        let expected =
            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#;
        assert_eq!(escape_pattern(special), expected.to_vec());
    }

    #[test]
    fn glob_test() {
        // (glob, expected regex)
        let cases: &[(&[u8], &[u8])] = &[
            (br#"?"#, br#"."#),
            (br#"*"#, br#"[^/]*"#),
            (br#"**"#, br#".*"#),
            (br#"**/a"#, br#"(?:.*/)?a"#),
            (br#"a/**/b"#, br#"a/(?:.*/)?b"#),
            (br#"[a*?!^][^b][!c]"#, br#"[a*?!^][\^b][^c]"#),
            (br#"{a,b}"#, br#"(?:a|b)"#),
            (br#".\*\?"#, br#"\.\*\?"#),
        ];
        for (glob, regex) in cases {
            assert_eq!(glob_to_re(glob), regex.to_vec());
        }
    }

    #[test]
    fn test_parse_pattern_file_contents() {
        // A `syntax:` line applies to the patterns that follow it.
        let lines = b"syntax: glob\n*.elc";
        assert_eq!(
            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
            parse_pattern_file_contents(lines, b"file_path", false).0,
        );

        // `syntax:` lines alone do not produce any pattern.
        let lines = b"syntax: include\nsyntax: glob";
        assert_eq!(
            parse_pattern_file_contents(lines, b"file_path", false).0,
            vec![]
        );

        // A per-line prefix is normalized to its relative form.
        let lines = b"glob:**.o";
        assert_eq!(
            parse_pattern_file_contents(lines, b"file_path", false).0,
            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
        );
    }

    #[test]
    fn test_build_single_regex_shortcut() {
        // (rootglob pattern, expected regex)
        let cases: &[(&[u8], &[u8])] = &[
            (b"", br"(?:/|$)"),
            (b"whatever", br"whatever(?:/|$)"),
            (b"*.o", br"[^/]*\.o"),
        ];
        for (pattern, expected) in cases {
            assert_eq!(
                expected.to_vec(),
                build_single_regex(b"rootglob", pattern, b"").unwrap()
            );
        }
    }
}
@@ -1,110 +1,124 b''
1 1 // filepatterns.rs
2 2 //
3 3 // Copyright 2019, Georges Racinet <gracinet@anybox.fr>,
4 4 // Raphaël Gomès <rgomes@octobus.net>
5 5 //
6 6 // This software may be used and distributed according to the terms of the
7 7 // GNU General Public License version 2 or any later version.
8 8
9 9 //! Bindings for the `hg::filepatterns` module provided by the
10 10 //! `hg-core` crate. From Python, this will be seen as `rustext.filepatterns`
11 11 //! and can be used as a replacement for the pure `filepatterns` Python module.
12 12 //!
13 13 use crate::exceptions::{PatternError, PatternFileError};
14 14 use cpython::{
15 15 PyBytes, PyDict, PyModule, PyObject, PyResult, PyTuple, Python, ToPyObject,
16 16 };
17 17 use hg::{build_single_regex, read_pattern_file, LineNumber, PatternTuple};
18 18
19 19 /// Rust does not like functions with different return signatures.
20 20 /// The 3-tuple version is always returned by the hg-core function,
21 21 /// the (potential) conversion is handled at this level since it is not likely
22 22 /// to have any measurable impact on performance.
23 23 ///
24 24 /// The Python implementation passes a function reference for `warn` instead
25 25 /// of a boolean that is used to emit warnings while parsing. The Rust
26 26 /// implementation chooses to accumulate the warnings and propagate them to
27 27 /// Python upon completion. See the `readpatternfile` function in `match.py`
28 28 /// for more details.
29 29 fn read_pattern_file_wrapper(
30 30 py: Python,
31 31 file_path: PyObject,
32 32 warn: bool,
33 33 source_info: bool,
34 34 ) -> PyResult<PyTuple> {
35 35 match read_pattern_file(file_path.extract::<PyBytes>(py)?.data(py), warn) {
36 36 Ok((patterns, warnings)) => {
37 37 if source_info {
38 38 let itemgetter = |x: &PatternTuple| {
39 39 (PyBytes::new(py, &x.0), x.1, PyBytes::new(py, &x.2))
40 40 };
41 41 let results: Vec<(PyBytes, LineNumber, PyBytes)> =
42 42 patterns.iter().map(itemgetter).collect();
43 return Ok((results, warnings).to_py_object(py));
43 return Ok((results, warnings_to_py_bytes(py, &warnings))
44 .to_py_object(py));
44 45 }
45 46 let itemgetter = |x: &PatternTuple| PyBytes::new(py, &x.0);
46 47 let results: Vec<PyBytes> =
47 48 patterns.iter().map(itemgetter).collect();
48 Ok((results, warnings).to_py_object(py))
49 Ok(
50 (results, warnings_to_py_bytes(py, &warnings))
51 .to_py_object(py),
52 )
49 53 }
50 54 Err(e) => Err(PatternFileError::pynew(py, e)),
51 55 }
52 56 }
53 57
58 fn warnings_to_py_bytes(
59 py: Python,
60 warnings: &[(Vec<u8>, Vec<u8>)],
61 ) -> Vec<(PyBytes, PyBytes)> {
62 warnings
63 .iter()
64 .map(|(path, syn)| (PyBytes::new(py, path), PyBytes::new(py, syn)))
65 .collect()
66 }
67
54 68 fn build_single_regex_wrapper(
55 69 py: Python,
56 70 kind: PyObject,
57 71 pat: PyObject,
58 72 globsuffix: PyObject,
59 73 ) -> PyResult<PyBytes> {
60 74 match build_single_regex(
61 75 kind.extract::<PyBytes>(py)?.data(py),
62 76 pat.extract::<PyBytes>(py)?.data(py),
63 77 globsuffix.extract::<PyBytes>(py)?.data(py),
64 78 ) {
65 79 Ok(regex) => Ok(PyBytes::new(py, &regex)),
66 80 Err(e) => Err(PatternError::pynew(py, e)),
67 81 }
68 82 }
69 83
70 84 pub fn init_module(py: Python, package: &str) -> PyResult<PyModule> {
71 85 let dotted_name = &format!("{}.filepatterns", package);
72 86 let m = PyModule::new(py, dotted_name)?;
73 87
74 88 m.add(py, "__package__", package)?;
75 89 m.add(
76 90 py,
77 91 "__doc__",
78 92 "Patterns files parsing - Rust implementation",
79 93 )?;
80 94 m.add(
81 95 py,
82 96 "build_single_regex",
83 97 py_fn!(
84 98 py,
85 99 build_single_regex_wrapper(
86 100 kind: PyObject,
87 101 pat: PyObject,
88 102 globsuffix: PyObject
89 103 )
90 104 ),
91 105 )?;
92 106 m.add(
93 107 py,
94 108 "read_pattern_file",
95 109 py_fn!(
96 110 py,
97 111 read_pattern_file_wrapper(
98 112 file_path: PyObject,
99 113 warn: bool,
100 114 source_info: bool
101 115 )
102 116 ),
103 117 )?;
104 118 m.add(py, "PatternError", py.get_type::<PatternError>())?;
105 119 let sys = PyModule::import(py, "sys")?;
106 120 let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?;
107 121 sys_modules.set_item(py, dotted_name, &m)?;
108 122
109 123 Ok(m)
110 124 }
General Comments 0
You need to be logged in to leave comments. Login now