##// END OF EJS Templates
stringutil: update list of re-special characters to include &~...
Augie Fackler -
r38496:de275ab3 default
parent child Browse files
Show More
@@ -1,560 +1,560
1 1 # stringutil.py - utility for generic string formatting, parsing, etc.
2 2 #
3 3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com>
4 4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
5 5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import ast
13 13 import codecs
14 14 import re as remod
15 15 import textwrap
16 16
17 17 from ..i18n import _
18 18 from ..thirdparty import attr
19 19
20 20 from .. import (
21 21 encoding,
22 22 error,
23 23 pycompat,
24 24 )
25 25
# Characters that carry special meaning inside a regular expression. The
# list follows https://bugs.python.org/issue29995, which shipped as part of
# Python 3.7.
_respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f')
# Translation table for unicode.translate(): maps the code point of every
# special character to that character preceded by a backslash.
_regexescapemap = {
    ord(char): (b'\\' + char).decode('latin1') for char in _respecial
}
30 30
def reescape(pat):
    """Escape regex-special characters in ``pat`` (re.escape stand-in).

    ``pat`` may be bytes or unicode; the result has the same type as the
    argument.
    """
    # NOTE: the escaping is deliberately performed on unicode, because it
    # can only be done with unicode.translate, not bytes.translate.
    if not isinstance(pat, bytes):
        return pat.translate(_regexescapemap)
    # bytes input: take a detour through latin1 and convert back afterwards
    escaped = pat.decode('latin1').translate(_regexescapemap)
    return escaped.encode('latin1')
44 44
def pprint(o, bprefix=False):
    """Return a printable representation of ``o``.

    bytes, bytearray, list, dict and tuple values are formatted here
    (containers recursively, dict items in sorted order); anything else is
    delegated to pycompat.byterepr(). When ``bprefix`` is true, bytes values
    are rendered with a leading ``b``.
    """
    def fmt(value):
        # recurse while carrying the caller's bprefix choice along
        return pprint(value, bprefix=bprefix)

    if isinstance(o, bytes):
        template = "b'%s'" if bprefix else "'%s'"
        return template % escapestr(o)
    if isinstance(o, bytearray):
        # codecs.escape_encode() can't handle bytearray, so escapestr fails
        # unless the value is coerced to bytes first.
        return "bytearray['%s']" % escapestr(bytes(o))
    if isinstance(o, list):
        return '[%s]' % (b', '.join(fmt(item) for item in o))
    if isinstance(o, dict):
        pairs = ('%s: %s' % (fmt(key), fmt(value))
                 for key, value in sorted(o.items()))
        return '{%s}' % (b', '.join(pairs))
    if isinstance(o, tuple):
        return '(%s)' % (b', '.join(fmt(item) for item in o))
    return pycompat.byterepr(o)
66 66
def prettyrepr(o):
    """Pretty print a representation of a possibly-nested object

    The byterepr of ``o`` is cut in front of every ``<`` (or in front of the
    ``field=`` word that introduces it) and each piece is put on its own
    line, indented by the number of ``<...>`` groups still open at that
    point.
    """
    rows = []
    text = pycompat.byterepr(o)
    total = len(text)
    # start:    where the piece currently being emitted begins
    # lastopen: position of the '<' found in the previous round
    start = lastopen = 0
    while start < total:
        # '... field=<type ... field=<type ...'
        # The piece runs from 'start' up to 'nextstart'; 'nextopen' is the
        # '<' that follows 'lastopen'.
        nextstart = -1
        nextopen = text.find('<', lastopen + 1)
        if nextopen < 0:
            # no further '<': the final piece runs to the end of the text
            nextopen = total
        elif nextopen > lastopen + 1 and text.startswith('=', nextopen - 1):
            # backtrack for ' field=<'
            nextstart = text.rfind(' ', lastopen + 1, nextopen - 1)
        if nextstart < 0:
            nextstart = nextopen
        else:
            nextstart += 1  # skip ' '
        depth = text.count('<', 0, start) - text.count('>', 0, start)
        assert depth >= 0
        rows.append((depth, text[start:nextstart].rstrip()))
        start, lastopen = nextstart, nextopen
    # NOTE(review): the indent unit below is reproduced exactly as it
    # appears in the reviewed text; confirm its intended width.
    return '\n'.join(' ' * depth + piece for depth, piece in rows)
92 92
def binary(s):
    """Report whether ``s`` is binary data, i.e. non-empty with a NUL in it"""
    if not s:
        return False
    return '\0' in s
96 96
def stringmatcher(pattern, casesensitive=True):
    """
    build a matcher from a string that may carry a 're:' or 'literal:' prefix.
    returns a (matcher name, pattern without prefix, matcher function) tuple.
    a missing or unrecognized prefix means a literal (whole string) match.
    an uncompilable regular expression raises error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        regexsrc = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            regex = remod.compile(regexsrc, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', regexsrc, regex.search

    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        return 'literal', pattern, pattern.__eq__

    # case-insensitive literal: compare the lower-cased forms
    ipat = encoding.lower(pattern)
    return 'literal', pattern, lambda s: ipat == encoding.lower(s)
155 155
def shortuser(user):
    """Return a short representation of a user name or email address.

    Everything from the first '@' on is dropped, then everything up to and
    including the first '<', and finally everything from the first space
    and from the first '.' on.
    """
    user = user.partition('@')[0]
    head, bracket, tail = user.partition('<')
    if bracket:
        user = tail
    for stop in (' ', '.'):
        user = user.partition(stop)[0]
    return user
171 171
def emailuser(user):
    """Return the user portion of an email address.

    That is the text before the first '@', minus anything up to and
    including a '<' found in that text.
    """
    local = user.partition('@')[0]
    head, bracket, tail = local.partition('<')
    return tail if bracket else local
181 181
def email(author):
    '''Return the email part of an author field.

    This is the text between the first '<' and the first '>'. A missing
    '<' makes the result start at the beginning of the field, a missing
    '>' makes it run to the end.
    '''
    begin = author.find('<') + 1
    close = author.find('>')
    if close == -1:
        return author[begin:]
    return author[begin:close]
188 188
def person(author):
    """Return the name part of an author field, read as per RFC 5322

    A field without '@' is returned unchanged. With a '<' present, the
    result is the text in front of it, stripped of surrounding spaces and
    double quotes. Otherwise the part before the '@' is used, with dots
    turned into spaces.

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author
    bracket = author.find('<')
    if bracket == -1:
        # bare address: derive a name from its local part
        local = author[:author.find('@')]
        return local.replace('.', ' ')
    name = author[:bracket].strip(' "')
    return name.replace('\\"', '"')
215 215
@attr.s(hash=True)
class mailmapping(object):
    '''Represents a username/email key or value in
    a mailmap file'''
    # hash=True is requested because instances serve as dictionary keys
    # in the mapping built by parsemailmap().

    # the email address; always required
    email = attr.ib()
    # the user name; None when the entry carries no name
    name = attr.ib(default=None)
222 222
223 223 def _ismailmaplineinvalid(names, emails):
224 224 '''Returns True if the parsed names and emails
225 225 in a mailmap entry are invalid.
226 226
227 227 >>> # No names or emails fails
228 228 >>> names, emails = [], []
229 229 >>> _ismailmaplineinvalid(names, emails)
230 230 True
231 231 >>> # Only one email fails
232 232 >>> emails = [b'email@email.com']
233 233 >>> _ismailmaplineinvalid(names, emails)
234 234 True
235 235 >>> # One email and one name passes
236 236 >>> names = [b'Test Name']
237 237 >>> _ismailmaplineinvalid(names, emails)
238 238 False
239 239 >>> # No names but two emails passes
240 240 >>> names = []
241 241 >>> emails = [b'proper@email.com', b'commit@email.com']
242 242 >>> _ismailmaplineinvalid(names, emails)
243 243 False
244 244 '''
245 245 return not emails or not names and len(emails) < 2
246 246
def parsemailmap(mailmapcontent):
    """Parse .mailmap formatted data into a dictionary

    Keys describe the identity found in a commit, values the identity it
    should be replaced with; both are mailmapping instances. ``None`` as
    input gives an empty dictionary, and comment or invalid lines are
    skipped.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')
    """
    mailmap = {}

    if mailmapcontent is None:
        return mailmap

    for line in mailmapcontent.splitlines():
        # Whole-line comments need no further inspection
        if line.lstrip().startswith('#'):
            continue

        # names/emails collect what has been parsed from this line so far,
        # words gathers the pieces of the name currently being read
        names, emails = [], []
        words = []

        for token in line.split():
            if token.startswith('#'):
                # A trailing comment ends the entry
                break

            if not (token.startswith('<') and token.endswith('>')):
                # Just another word of a name
                words.append(token)
                continue

            # An email: store it without the "<>" and close the name
            # that was being collected in front of it, if any
            emails.append(token[1:-1])
            if words:
                names.append(' '.join(words))
                words = []

            # Nothing after a second email fits the .mailmap spec
            if len(emails) > 1:
                break

        # A usable entry has at least one email, and either at least one
        # name or a second email
        if _ismailmaplineinvalid(names, emails):
            continue

        # The commit name only exists when two names were given
        commitname = names[-1] if len(names) == 2 else None
        propername = names[0] if names else None

        key = mailmapping(email=emails[-1], name=commitname)
        mailmap[key] = mailmapping(email=emails[0], name=propername)

    return mailmap
328 328
def mapname(mailmap, author):
    """Look ``author`` up in the parsed mailmap and return the mapped
    author field, or the original field if nothing applies.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # Nothing to do for a malformed author field or an empty mailmap
    if not (isauthorwellformed(author) and mailmap):
        return author

    # Express the incoming author as a mailmapping
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # First attempt: name and email together form the key
        proper = mailmap[commit]
    except KeyError:
        # Second attempt: the email alone. A separate object is used so
        # that the fields of the original commit stay available below.
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Whatever the mapping leaves unset is taken from the commit
    name = proper.name or commit.name
    address = proper.email or commit.email
    return '%s <%s>' % (name, address)
375 375
# Shape of a well-formed author field: one or more characters other than
# '<', a whitespace character, then '<', an address containing '@' and no
# angle brackets, and a closing '>' that ends the field.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    '''Tell whether the author field has the expected form
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    return bool(_correctauthorformat.match(author))
398 398
def ellipsis(text, maxlength=400):
    """Trim ``text`` to at most ``maxlength`` display columns (400 unless
    told otherwise), passing '...' to encoding.trim() as the ellipsis."""
    marker = '...'
    return encoding.trim(text, maxlength, ellipsis=marker)
402 402
def escapestr(s):
    """Return the bytes ``s`` with backslash escapes applied"""
    # This is the function underlying s.encode('string_escape'); it is
    # called directly for the sake of Python 3 compatibility.
    escaped, _consumed = codecs.escape_encode(s)
    return escaped
407 407
def unescapestr(s):
    """Return the bytes ``s`` with its backslash escapes decoded"""
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
410 410
def forcebytestr(obj):
    """Format an arbitrary object (e.g. an exception) as a byte string
    in a portable way."""
    try:
        result = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string: go through the local encoding, may be lossy
        result = pycompat.bytestr(encoding.strtolocal(str(obj)))
    return result
419 419
def uirepr(s):
    """Return the byterepr of ``s`` with doubled backslashes collapsed"""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # repr() doubles every backslash, which looks odd in Windows paths
    return quoted.replace(b'\\\\', b'\\')
423 423
# The wrapper class is only built on first use: the first call defines it,
# rebinds the module-level name to the class and returns an instance.
def _MBTextWrapper(**kwargs):
    class tw(textwrap.TextWrapper):
        """
        TextWrapper variant that is aware of display width.

        Terminal columns taken by a string can be derived neither from its
        number of 'bytes' in some encoding nor from its number of
        'characters'.

        The stock TextWrapper calls the built-in 'len()' directly, so the
        methods below are overridden to use the width of each character.

        Note that characters of 'ambiguous' width are treated as wide in
        East Asian areas, but as narrow elsewhere.

        Determining the width of such characters requires a user decision.
        """
        def _cutdown(self, ucstr, space_left):
            # Split ucstr in front of the first character that no longer
            # fits into space_left columns.
            used = 0
            colwidth = encoding.ucolwidth
            for pos, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:pos], ucstr[pos:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # put as much of the word as fits on this line and leave
                # the remainder on the stack
                head, rest = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(head)
                reversed_chunks[-1] = rest
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Reverse the list so it can be used as a stack of chunks with
            # cheap pops from the end.
            chunks.reverse()

            while chunks:

                # Chunks collected for the line being built, and the
                # accumulated width of those chunks.
                cur_line = []
                cur_len = 0

                # Only the very first line gets the initial indent.
                indent = (self.subsequent_indent if lines
                          else self.initial_indent)

                # Columns left for text on this line.
                width = self.width - len(indent)

                # Drop a whitespace chunk at the start of a line, except
                # at the very beginning of the text (no lines stored yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    chunkwidth = colwidth(chunks[-1])

                    # The line is full once the next chunk does not fit.
                    if cur_len + chunkwidth > width:
                        break

                    cur_line.append(chunks.pop())
                    cur_len += chunkwidth

                # The next chunk would not fit on *any* line, not just on
                # this one.
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # Drop a whitespace-only chunk at the end of the line.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Join the collected chunks into a string and add it to
                # the list of lines (the return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
527 527
def wrap(line, width, initindent='', hangindent=''):
    """Wrap the byte string ``line`` to ``width`` display columns

    ``initindent`` is put in front of the first output line and
    ``hangindent`` in front of the following ones. All three strings are
    decoded with encoding.encoding / encoding.encodingmode before wrapping
    and the result is encoded back with encoding.encoding.
    """
    maxindent = max(len(hangindent), len(initindent))
    if width <= maxindent:
        # adjust for weird terminal size
        width = max(78, maxindent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errmode = pycompat.sysstr(encoding.encodingmode)
    line = line.decode(codec, errmode)
    initindent = initindent.decode(codec, errmode)
    hangindent = hangindent.decode(codec, errmode)

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=initindent,
                             subsequent_indent=hangindent)
    filled = wrapper.fill(line)
    return filled.encode(codec)
543 543
# Lower-case spellings recognized by parsebool()
_booleans = {
    '1': True,
    'yes': True,
    'true': True,
    'on': True,
    'always': True,
    '0': False,
    'no': False,
    'false': False,
    'off': False,
    'never': False,
}

def parsebool(s):
    """Parse s into a boolean, ignoring case.

    Returns None if s is not a valid boolean.
    """
    return _booleans.get(s.lower())
554 554
def evalpythonliteral(s):
    """Evaluate a string holding a Python literal expression and return
    the resulting value"""
    # We could backport our tokenizer hack to rewrite '' to u'' if we want
    # On Python 3 the input is decoded as latin1 before being evaluated.
    source = s.decode('latin1') if pycompat.ispy3 else s
    return ast.literal_eval(source)
General Comments 0
You need to be logged in to leave comments. Login now