##// END OF EJS Templates
stringutil: improve check for failed mailmap line parsing...
Connor Sheehan -
r37263:0e7550b0 default
parent child Browse files
Show More
@@ -1,469 +1,499
1 1 # stringutil.py - utility for generic string formatting, parsing, etc.
2 2 #
3 3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com>
4 4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
5 5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import codecs
13 13 import re as remod
14 14 import textwrap
15 15
16 16 from ..i18n import _
17 17 from ..thirdparty import attr
18 18
19 19 from .. import (
20 20 encoding,
21 21 error,
22 22 pycompat,
23 23 )
24 24
# Lookup table used by escapedata(): one entry per byte value 0..255, each
# mapped to a literal ``\xNN`` hex escape by default.
# NOTE(review): presumably pycompat.bytechr(i) yields the one-byte string for
# ordinal i -- confirm against pycompat.
_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
# Backslash, CR and LF get short symbolic escapes instead of the hex form.
_DATA_ESCAPE_MAP.update({
    b'\\': b'\\\\',
    b'\r': br'\r',
    b'\n': br'\n',
})
# Matches the bytes that escapedata() rewrites: 0x00-0x08, 0x0a-0x1f,
# backslash and 0x7f-0xff.  Tab (0x09) and the remaining ASCII range
# 0x20-0x7e (minus backslash) are not matched and therefore pass through.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
32 32
def escapedata(s):
    """Return ``s`` with every byte matched by ``_DATA_ESCAPE_RE`` replaced
    by its entry in ``_DATA_ESCAPE_MAP``.

    A ``bytearray`` argument is first copied into ``bytes``; any other
    argument is passed to the regex substitution unchanged.
    """
    data = bytes(s) if isinstance(s, bytearray) else s

    def _escape(found):
        # the pattern matches exactly one byte at a time
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escape, data)
38 38
def binary(s):
    """return true if a string is binary data

    "Binary" here means the value is non-empty and contains a NUL
    character; empty or ``None`` input is never binary.
    """
    if not s:
        return False
    return '\0' in s
42 42
def stringmatcher(pattern, casesensitive=True):
    """
    Build a matcher from a pattern with an optional 're:' or 'literal:'
    prefix.

    Returns a ``(kind, pattern, matcher)`` tuple where ``kind`` is either
    're' or 'literal', ``pattern`` is the input with a recognized prefix
    removed, and ``matcher`` is a callable taking the string to test.
    A missing or unknown prefix means a literal (whole-string) match.
    An uncompilable regular expression raises error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        regexsrc = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            compiled = remod.compile(regexsrc, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        # regex patterns match anywhere in the subject (search, not match)
        return 're', regexsrc, compiled.search

    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        return 'literal', pattern, pattern.__eq__

    # case-insensitive literal: compare the lowered forms
    ipat = encoding.lower(pattern)
    return 'literal', pattern, lambda s: ipat == encoding.lower(s)
101 101
def shortuser(user):
    """Return a short representation of a user name or email address.

    Successively keeps: the text before the first '@', then the text after
    the first '<', then the text before the first space, then the text
    before the first '.'.  Each step is a no-op when its delimiter is absent.
    """
    user = user.split('@', 1)[0]
    user = user.split('<', 1)[-1]
    user = user.split(' ', 1)[0]
    return user.split('.', 1)[0]
117 117
def emailuser(user):
    """Return the user portion of an email address.

    Drops everything from the first '@' onwards, then everything up to and
    including the first '<' of what remains.
    """
    local = user.split('@', 1)[0]
    return local.split('<', 1)[-1]
127 127
def email(author):
    '''get email of author.

    Returns the text between the first '<' and the first '>'.  When there
    is no '<' the result starts at the beginning of the string; when there
    is no '>' it runs to the end.
    '''
    start = author.find('<') + 1
    end = author.find('>')
    if end == -1:
        return author[start:]
    return author[start:end]
134 134
def person(author):
    """Returns the name before an email address,
    interpreting it as per RFC 5322

    Without an '@' the input is returned untouched.  With a '<', the text
    in front of it is stripped of surrounding spaces and double quotes and
    escaped quotes are unescaped.  Otherwise the part before '@' is used
    with dots turned into spaces.

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author

    bracket = author.find('<')
    if bracket == -1:
        # bare address: derive a name from the local part
        localpart = author[:author.find('@')]
        return localpart.replace('.', ' ')

    displayname = author[:bracket].strip(' "')
    return displayname.replace('\\"', '"')
161 161
# hash=True is passed to attr.s; instances are used as dictionary keys by
# parsemailmap() and looked up by mapname().
@attr.s(hash=True)
class mailmapping(object):
    '''Represents a username/email key or value in
    a mailmap file'''
    # email address of the entry (declared without a default)
    email = attr.ib()
    # name associated with the email; None when the entry carries no name
    name = attr.ib(default=None)
168 168
169 def _ismailmaplineinvalid(names, emails):
170 '''Returns True if the parsed names and emails
171 in a mailmap entry are invalid.
172
173 >>> # No names or emails fails
174 >>> names, emails = [], []
175 >>> _ismailmaplineinvalid(names, emails)
176 True
177 >>> # Only one email fails
178 >>> emails = [b'email@email.com']
179 >>> _ismailmaplineinvalid(names, emails)
180 True
181 >>> # One email and one name passes
182 >>> names = [b'Test Name']
183 >>> _ismailmaplineinvalid(names, emails)
184 False
185 >>> # No names but two emails passes
186 >>> names = []
187 >>> emails = [b'proper@email.com', b'commit@email.com']
188 >>> _ismailmaplineinvalid(names, emails)
189 False
190 '''
191 return not emails or not names and len(emails) < 2
192
def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    ``mailmapcontent`` is the raw text of a mailmap file, or None.  The
    return value is a dict whose keys are ``mailmapping`` objects built
    from the commit email (and the commit name, when the line carries two
    names) and whose values are ``mailmapping`` objects holding the proper
    email and, if present, the proper name.  None input yields an empty
    dict.  Comment lines and lines that do not parse into at least one
    email plus either a name or a second email are skipped.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')
    """
    mailmap = {}

    if mailmapcontent is None:
        return mailmap

    for line in mailmapcontent.splitlines():

        # Skip whole-line comments up front.  Malformed entries are not
        # filtered here; they are rejected after parsing, below.
        if line.lstrip().startswith('#'):
            continue

        # names, emails hold the parsed emails and names for each line
        # namebuilder holds the words in a persons name
        names, emails = [], []
        namebuilder = []

        for element in line.split():
            if element.startswith('#'):
                # If we reach a comment in the mailmap file, move on
                break

            elif element.startswith('<') and element.endswith('>'):
                # We have found an email.
                # Parse it, and finalize any names from earlier
                emails.append(element[1:-1])  # Slice off the "<>"

                if namebuilder:
                    names.append(' '.join(namebuilder))
                    namebuilder = []

                # Break if we have found a second email, any other
                # data does not fit the spec for .mailmap
                if len(emails) > 1:
                    break

            else:
                # We have found another word in the committers name
                namebuilder.append(element)

        # Check to see if we have parsed the line into a valid form
        # We require at least one email, and either at least one
        # name or a second email
        if _ismailmaplineinvalid(names, emails):
            continue

        # The last email on the line is the one recorded in commits; a
        # commit name is only part of the key when two names were given.
        mailmapkey = mailmapping(
            email=emails[-1],
            name=names[-1] if len(names) == 2 else None,
        )

        # The first email (and first name, if any) is the proper identity.
        mailmap[mailmapkey] = mailmapping(
            email=emails[0],
            name=names[0] if names else None,
        )

    return mailmap
244 274
def mapname(mailmap, author):
    """Returns the author field according to the mailmap cache, or
    the original author field.

    The lookup first uses both the commit name and email as key, then
    falls back to the email alone.  Whatever the matching entry leaves
    unset is taken from the original author field.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # Nothing to do for a malformed author field or an empty mailmap
    if not isauthorwellformed(author) or not mailmap:
        return author

    # Key built from the identity recorded in the commit
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # Most specific key first: commit name and email together
        proper = mailmap[commit]
    except KeyError:
        # Fall back to an email-only key; an entry with neither field set
        # stands in when that is missing too
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Fill in whatever the mailmap entry does not provide
    propername = proper.name or commit.name
    properemail = proper.email or commit.email
    return '%s <%s>' % (propername, properemail)
291 321
# "<name> <local@domain>": a non-empty name without '<', one whitespace
# character, then an address in angle brackets containing an '@'
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    '''Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    if _correctauthorformat.match(author) is None:
        return False
    return True
314 344
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    Forwards ``text`` and ``maxlength`` to ``encoding.trim()`` with
    ``'...'`` passed as its ``ellipsis`` argument and returns the result.
    """
    return encoding.trim(text, maxlength, ellipsis='...')
318 348
def escapestr(s):
    """Return ``s`` with backslash escapes applied, as produced by
    ``codecs.escape_encode()``."""
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    escaped, _length = codecs.escape_encode(s)
    return escaped
323 353
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded, as produced by
    ``codecs.escape_decode()``."""
    unescaped, _length = codecs.escape_decode(s)
    return unescaped
326 356
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    Tries ``pycompat.bytestr(obj)`` first; if that raises
    UnicodeEncodeError, ``str(obj)`` is passed through
    ``encoding.strtolocal()`` before being wrapped in ``pycompat.bytestr``.
    """
    try:
        return pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        return pycompat.bytestr(encoding.strtolocal(str(obj)))
335 365
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed
    into single ones."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
339 369
# Delay creation of the TextWrapper subclass until it is first needed: the
# first call defines the class, rebinds the module-level name
# _MBTextWrapper to it, and returns an instance, so later calls
# instantiate the class directly.
def _MBTextWrapper(**kwargs):
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            # Split ucstr into (head, tail) where head is the longest prefix
            # whose accumulated column width does not exceed space_left.
            l = 0
            colwidth = encoding.ucolwidth
            for i in xrange(len(ucstr)):
                l += colwidth(ucstr[i])
                if space_left < l:
                    return (ucstr[:i], ucstr[i:])
            # the whole string fits
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            # always leave room for at least one column
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # put as much of the chunk as fits on this line and leave
                # the remainder on the stack for the next one
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # not allowed to break the word: give it a line of its own
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # Replace this factory with the class itself for subsequent calls.
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
443 473
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` columns and return the filled text.

    ``initindent`` prefixes the first output line and ``hangindent`` every
    following one.  All three strings are decoded using encoding.encoding
    and encoding.encodingmode, wrapped with _MBTextWrapper, and the result
    is encoded back with encoding.encoding.  If ``width`` does not exceed
    the longer indent, it is replaced by max(78, that length + 1).
    """
    widest = max(len(hangindent), len(initindent))
    if width <= widest:
        # adjust for weird terminal size
        width = max(78, widest + 1)

    def _decode(s):
        return s.decode(pycompat.sysstr(encoding.encoding),
                        pycompat.sysstr(encoding.encodingmode))

    line = _decode(line)
    initindent = _decode(initindent)
    hangindent = _decode(hangindent)

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=initindent,
                             subsequent_indent=hangindent)
    filled = wrapper.fill(line)
    return filled.encode(pycompat.sysstr(encoding.encoding))
459 489
460 490 _booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
461 491 '0': False, 'no': False, 'false': False, 'off': False,
462 492 'never': False}
463 493
464 494 def parsebool(s):
465 495 """Parse s into a boolean.
466 496
467 497 If s is not a valid boolean, returns None.
468 498 """
469 499 return _booleans.get(s.lower(), None)
General Comments 0
You need to be logged in to leave comments. Login now