##// END OF EJS Templates
match: optimize escaping in _globre...
Matt Mackall -
r8583:19d1b2ae default
parent child Browse files
Show More
@@ -1,251 +1,254 b''
1 # match.py - file name matching
1 # match.py - file name matching
2 #
2 #
3 # Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2, incorporated herein by reference.
6 # GNU General Public License version 2, incorporated herein by reference.
7
7
8 import util, re
8 import util, re
9
9
10 class _match(object):
10 class _match(object):
11 def __init__(self, root, cwd, files, mf, ap):
11 def __init__(self, root, cwd, files, mf, ap):
12 self._root = root
12 self._root = root
13 self._cwd = cwd
13 self._cwd = cwd
14 self._files = files
14 self._files = files
15 self._fmap = set(files)
15 self._fmap = set(files)
16 self.matchfn = mf
16 self.matchfn = mf
17 self._anypats = ap
17 self._anypats = ap
18 def __call__(self, fn):
18 def __call__(self, fn):
19 return self.matchfn(fn)
19 return self.matchfn(fn)
20 def __iter__(self):
20 def __iter__(self):
21 for f in self._files:
21 for f in self._files:
22 yield f
22 yield f
23 def bad(self, f, msg):
23 def bad(self, f, msg):
24 return True
24 return True
25 def dir(self, f):
25 def dir(self, f):
26 pass
26 pass
27 def missing(self, f):
27 def missing(self, f):
28 pass
28 pass
29 def exact(self, f):
29 def exact(self, f):
30 return f in self._fmap
30 return f in self._fmap
31 def rel(self, f):
31 def rel(self, f):
32 return util.pathto(self._root, self._cwd, f)
32 return util.pathto(self._root, self._cwd, f)
33 def files(self):
33 def files(self):
34 return self._files
34 return self._files
35 def anypats(self):
35 def anypats(self):
36 return self._anypats
36 return self._anypats
37
37
38 class always(_match):
38 class always(_match):
39 def __init__(self, root, cwd):
39 def __init__(self, root, cwd):
40 _match.__init__(self, root, cwd, [], lambda f: True, False)
40 _match.__init__(self, root, cwd, [], lambda f: True, False)
41
41
42 class never(_match):
42 class never(_match):
43 def __init__(self, root, cwd):
43 def __init__(self, root, cwd):
44 _match.__init__(self, root, cwd, [], lambda f: False, False)
44 _match.__init__(self, root, cwd, [], lambda f: False, False)
45
45
46 class exact(_match):
46 class exact(_match):
47 def __init__(self, root, cwd, files):
47 def __init__(self, root, cwd, files):
48 _match.__init__(self, root, cwd, files, self.exact, False)
48 _match.__init__(self, root, cwd, files, self.exact, False)
49
49
50 class match(_match):
50 class match(_match):
51 def __init__(self, root, cwd, patterns, include=[], exclude=[],
51 def __init__(self, root, cwd, patterns, include=[], exclude=[],
52 default='glob'):
52 default='glob'):
53 """build an object to match a set of file patterns
53 """build an object to match a set of file patterns
54
54
55 arguments:
55 arguments:
56 root - the canonical root of the tree you're matching against
56 root - the canonical root of the tree you're matching against
57 cwd - the current working directory, if relevant
57 cwd - the current working directory, if relevant
58 patterns - patterns to find
58 patterns - patterns to find
59 include - patterns to include
59 include - patterns to include
60 exclude - patterns to exclude
60 exclude - patterns to exclude
61 default - if a pattern in names has no explicit type, assume this one
61 default - if a pattern in names has no explicit type, assume this one
62
62
63 a pattern is one of:
63 a pattern is one of:
64 'glob:<glob>' - a glob relative to cwd
64 'glob:<glob>' - a glob relative to cwd
65 're:<regexp>' - a regular expression
65 're:<regexp>' - a regular expression
66 'path:<path>' - a path relative to canonroot
66 'path:<path>' - a path relative to canonroot
67 'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
67 'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
68 'relpath:<path>' - a path relative to cwd
68 'relpath:<path>' - a path relative to cwd
69 'relre:<regexp>' - a regexp that doesn't have to match the start of a name
69 'relre:<regexp>' - a regexp that doesn't have to match the start of a name
70 '<something>' - one of the cases above, selected by the dflt_pat argument
70 '<something>' - one of the cases above, selected by the dflt_pat argument
71 """
71 """
72
72
73 roots = []
73 roots = []
74 anypats = bool(include or exclude)
74 anypats = bool(include or exclude)
75
75
76 if patterns:
76 if patterns:
77 pats = _normalize(patterns, default, root, cwd)
77 pats = _normalize(patterns, default, root, cwd)
78 roots = _roots(pats)
78 roots = _roots(pats)
79 anypats = anypats or _anypats(pats)
79 anypats = anypats or _anypats(pats)
80 pm = _buildmatch(pats, '$')
80 pm = _buildmatch(pats, '$')
81 if include:
81 if include:
82 im = _buildmatch(_normalize(include, 'glob', root, cwd), '(?:/|$)')
82 im = _buildmatch(_normalize(include, 'glob', root, cwd), '(?:/|$)')
83 if exclude:
83 if exclude:
84 em = _buildmatch(_normalize(exclude, 'glob', root, cwd), '(?:/|$)')
84 em = _buildmatch(_normalize(exclude, 'glob', root, cwd), '(?:/|$)')
85
85
86 if patterns:
86 if patterns:
87 if include:
87 if include:
88 if exclude:
88 if exclude:
89 m = lambda f: im(f) and not em(f) and pm(f)
89 m = lambda f: im(f) and not em(f) and pm(f)
90 else:
90 else:
91 m = lambda f: im(f) and pm(f)
91 m = lambda f: im(f) and pm(f)
92 else:
92 else:
93 if exclude:
93 if exclude:
94 m = lambda f: not em(f) and pm(f)
94 m = lambda f: not em(f) and pm(f)
95 else:
95 else:
96 m = pm
96 m = pm
97 else:
97 else:
98 if include:
98 if include:
99 if exclude:
99 if exclude:
100 m = lambda f: im(f) and not em(f)
100 m = lambda f: im(f) and not em(f)
101 else:
101 else:
102 m = im
102 m = im
103 else:
103 else:
104 if exclude:
104 if exclude:
105 m = lambda f: not em(f)
105 m = lambda f: not em(f)
106 else:
106 else:
107 m = lambda f: True
107 m = lambda f: True
108
108
109 _match.__init__(self, root, cwd, roots, m, anypats)
109 _match.__init__(self, root, cwd, roots, m, anypats)
110
110
111 def patkind(pat):
111 def patkind(pat):
112 return _patsplit(pat, None)[0]
112 return _patsplit(pat, None)[0]
113
113
114 def _patsplit(pat, default):
114 def _patsplit(pat, default):
115 """Split a string into an optional pattern kind prefix and the
115 """Split a string into an optional pattern kind prefix and the
116 actual pattern."""
116 actual pattern."""
117 if ':' in pat:
117 if ':' in pat:
118 pat, val = pat.split(':', 1)
118 pat, val = pat.split(':', 1)
119 if pat in ('re', 'glob', 'path', 'relglob', 'relpath', 'relre'):
119 if pat in ('re', 'glob', 'path', 'relglob', 'relpath', 'relre'):
120 return pat, val
120 return pat, val
121 return default, pat
121 return default, pat
122
122
123 def _globre(pat):
123 def _globre(pat):
124 "convert a glob pattern into a regexp"
124 "convert a glob pattern into a regexp"
125 i, n = 0, len(pat)
125 i, n = 0, len(pat)
126 res = ''
126 res = ''
127 group = 0
127 group = 0
128 escape = re.escape
128 def peek(): return i < n and pat[i]
129 def peek(): return i < n and pat[i]
129 while i < n:
130 while i < n:
130 c = pat[i]
131 c = pat[i]
131 i = i+1
132 i = i+1
132 if c == '*':
133 if c not in '*?[{},\\':
134 res += escape(c)
135 elif c == '*':
133 if peek() == '*':
136 if peek() == '*':
134 i += 1
137 i += 1
135 res += '.*'
138 res += '.*'
136 else:
139 else:
137 res += '[^/]*'
140 res += '[^/]*'
138 elif c == '?':
141 elif c == '?':
139 res += '.'
142 res += '.'
140 elif c == '[':
143 elif c == '[':
141 j = i
144 j = i
142 if j < n and pat[j] in '!]':
145 if j < n and pat[j] in '!]':
143 j += 1
146 j += 1
144 while j < n and pat[j] != ']':
147 while j < n and pat[j] != ']':
145 j += 1
148 j += 1
146 if j >= n:
149 if j >= n:
147 res += '\\['
150 res += '\\['
148 else:
151 else:
149 stuff = pat[i:j].replace('\\','\\\\')
152 stuff = pat[i:j].replace('\\','\\\\')
150 i = j + 1
153 i = j + 1
151 if stuff[0] == '!':
154 if stuff[0] == '!':
152 stuff = '^' + stuff[1:]
155 stuff = '^' + stuff[1:]
153 elif stuff[0] == '^':
156 elif stuff[0] == '^':
154 stuff = '\\' + stuff
157 stuff = '\\' + stuff
155 res = '%s[%s]' % (res, stuff)
158 res = '%s[%s]' % (res, stuff)
156 elif c == '{':
159 elif c == '{':
157 group += 1
160 group += 1
158 res += '(?:'
161 res += '(?:'
159 elif c == '}' and group:
162 elif c == '}' and group:
160 res += ')'
163 res += ')'
161 group -= 1
164 group -= 1
162 elif c == ',' and group:
165 elif c == ',' and group:
163 res += '|'
166 res += '|'
164 elif c == '\\':
167 elif c == '\\':
165 p = peek()
168 p = peek()
166 if p:
169 if p:
167 i += 1
170 i += 1
168 res += re.escape(p)
171 res += escape(p)
169 else:
172 else:
170 res += re.escape(c)
173 res += escape(c)
171 else:
174 else:
172 res += re.escape(c)
175 res += escape(c)
173 return res
176 return res
174
177
175 def _regex(kind, name, tail):
178 def _regex(kind, name, tail):
176 '''convert a pattern into a regular expression'''
179 '''convert a pattern into a regular expression'''
177 if not name:
180 if not name:
178 return ''
181 return ''
179 if kind == 're':
182 if kind == 're':
180 return name
183 return name
181 elif kind == 'path':
184 elif kind == 'path':
182 return '^' + re.escape(name) + '(?:/|$)'
185 return '^' + re.escape(name) + '(?:/|$)'
183 elif kind == 'relglob':
186 elif kind == 'relglob':
184 return '(?:|.*/)' + _globre(name) + tail
187 return '(?:|.*/)' + _globre(name) + tail
185 elif kind == 'relpath':
188 elif kind == 'relpath':
186 return re.escape(name) + '(?:/|$)'
189 return re.escape(name) + '(?:/|$)'
187 elif kind == 'relre':
190 elif kind == 'relre':
188 if name.startswith('^'):
191 if name.startswith('^'):
189 return name
192 return name
190 return '.*' + name
193 return '.*' + name
191 return _globre(name) + tail
194 return _globre(name) + tail
192
195
193 def _buildmatch(pats, tail):
196 def _buildmatch(pats, tail):
194 """build a matching function from a set of patterns"""
197 """build a matching function from a set of patterns"""
195 try:
198 try:
196 pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
199 pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
197 if len(pat) > 20000:
200 if len(pat) > 20000:
198 raise OverflowError()
201 raise OverflowError()
199 return re.compile(pat).match
202 return re.compile(pat).match
200 except OverflowError:
203 except OverflowError:
201 # We're using a Python with a tiny regex engine and we
204 # We're using a Python with a tiny regex engine and we
202 # made it explode, so we'll divide the pattern list in two
205 # made it explode, so we'll divide the pattern list in two
203 # until it works
206 # until it works
204 l = len(pats)
207 l = len(pats)
205 if l < 2:
208 if l < 2:
206 raise
209 raise
207 a, b = _buildmatch(pats[:l//2], tail), _buildmatch(pats[l//2:], tail)
210 a, b = _buildmatch(pats[:l//2], tail), _buildmatch(pats[l//2:], tail)
208 return lambda s: a(s) or b(s)
211 return lambda s: a(s) or b(s)
209 except re.error:
212 except re.error:
210 for k, p in pats:
213 for k, p in pats:
211 try:
214 try:
212 re.compile('(?:%s)' % _regex(k, p, tail))
215 re.compile('(?:%s)' % _regex(k, p, tail))
213 except re.error:
216 except re.error:
214 raise util.Abort("invalid pattern (%s): %s" % (k, p))
217 raise util.Abort("invalid pattern (%s): %s" % (k, p))
215 raise util.Abort("invalid pattern")
218 raise util.Abort("invalid pattern")
216
219
217 def _globprefix(pat):
220 def _globprefix(pat):
218 '''return the non-glob prefix of a path, e.g. foo/* -> foo'''
221 '''return the non-glob prefix of a path, e.g. foo/* -> foo'''
219 root = []
222 root = []
220 for p in pat.split('/'):
223 for p in pat.split('/'):
221 if '[' in p or '{' in p or '*' in p or '?' in p:
224 if '[' in p or '{' in p or '*' in p or '?' in p:
222 break
225 break
223 root.append(p)
226 root.append(p)
224 return '/'.join(root) or '.'
227 return '/'.join(root) or '.'
225
228
226 def _normalize(names, default, root, cwd):
229 def _normalize(names, default, root, cwd):
227 pats = []
230 pats = []
228 for kind, name in [_patsplit(p, default) for p in names]:
231 for kind, name in [_patsplit(p, default) for p in names]:
229 if kind in ('glob', 'relpath'):
232 if kind in ('glob', 'relpath'):
230 name = util.canonpath(root, cwd, name)
233 name = util.canonpath(root, cwd, name)
231 elif kind in ('relglob', 'path'):
234 elif kind in ('relglob', 'path'):
232 name = util.normpath(name)
235 name = util.normpath(name)
233
236
234 pats.append((kind, name))
237 pats.append((kind, name))
235 return pats
238 return pats
236
239
237 def _roots(patterns):
240 def _roots(patterns):
238 r = []
241 r = []
239 for kind, name in patterns:
242 for kind, name in patterns:
240 if kind == 'glob':
243 if kind == 'glob':
241 r.append(_globprefix(name))
244 r.append(_globprefix(name))
242 elif kind in ('relpath', 'path'):
245 elif kind in ('relpath', 'path'):
243 r.append(name or '.')
246 r.append(name or '.')
244 elif kind == 'relglob':
247 elif kind == 'relglob':
245 r.append('.')
248 r.append('.')
246 return r
249 return r
247
250
248 def _anypats(patterns):
251 def _anypats(patterns):
249 for kind, name in patterns:
252 for kind, name in patterns:
250 if kind in ('glob', 're', 'relglob', 'relre'):
253 if kind in ('glob', 're', 'relglob', 'relre'):
251 return True
254 return True
General Comments 0
You need to be logged in to leave comments. Login now