##// END OF EJS Templates
stringutil: refactor core of pprint so it emits chunks...
Gregory Szorc -
r39389:0d21b1f1 default
parent child Browse files
Show More
@@ -1,591 +1,597 b''
1 1 # stringutil.py - utility for generic string formatting, parsing, etc.
2 2 #
3 3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com>
4 4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
5 5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
6 6 #
7 7 # This software may be used and distributed according to the terms of the
8 8 # GNU General Public License version 2 or any later version.
9 9
10 10 from __future__ import absolute_import
11 11
12 12 import ast
13 13 import codecs
14 14 import re as remod
15 15 import textwrap
16 16 import types
17 17
18 18 from ..i18n import _
19 19 from ..thirdparty import attr
20 20
21 21 from .. import (
22 22 encoding,
23 23 error,
24 24 pycompat,
25 25 )
26 26
# Characters that need a backslash in a regular expression. The list comes
# from https://bugs.python.org/issue29995, which was part of Python 3.7.
_respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f')
# Translation table for unicode.translate(): code point -> escaped text.
_regexescapemap = dict(
    (ord(ch), (b'\\' + ch).decode('latin1')) for ch in _respecial)
31 31
def reescape(pat):
    """Drop-in replacement for re.escape.

    Accepts either bytes or unicode and returns the same type it was given.
    """
    # NOTE: the escaping itself is deliberately done on unicode, because
    # only unicode.translate (not bytes.translate) can map one character
    # to several. Bytes input therefore round-trips through latin1.
    if isinstance(pat, bytes):
        escaped = pat.decode('latin1').translate(_regexescapemap)
        return escaped.encode('latin1')
    return pat.translate(_regexescapemap)
45 45
def pprint(o, bprefix=False):
    """Pretty print an object.

    Concatenates every chunk produced by ``pprintgen()`` into one string.
    """
    chunks = pprintgen(o, bprefix=bprefix)
    return b''.join(chunks)
49
def pprintgen(o, bprefix=False):
    """Pretty print an object to a generator of atoms.

    ``bprefix`` selects the ``b'...'`` form for bytes values and is passed
    on unchanged to every nested ``pprint()`` call.
    """
    def joined(items):
        # Comma-separated rendering of each item, same bprefix setting.
        return b', '.join(pprint(item, bprefix=bprefix) for item in items)

    if isinstance(o, bytes):
        template = "b'%s'" if bprefix else "'%s'"
        yield template % escapestr(o)
    elif isinstance(o, bytearray):
        # codecs.escape_encode() can't handle bytearray, so escapestr fails
        # without coercion.
        yield "bytearray['%s']" % escapestr(bytes(o))
    elif isinstance(o, list):
        yield '[%s]' % joined(o)
    elif isinstance(o, dict):
        # Entries are emitted in sorted item order.
        pairs = ('%s: %s' % (pprint(key, bprefix=bprefix),
                             pprint(value, bprefix=bprefix))
                 for key, value in sorted(o.items()))
        yield '{%s}' % b', '.join(pairs)
    elif isinstance(o, set):
        yield 'set([%s])' % joined(sorted(o))
    elif isinstance(o, tuple):
        yield '(%s)' % joined(o)
    elif isinstance(o, types.GeneratorType):
        # Rendering consumes the generator.
        yield 'gen[%s]' % joined(o)
    else:
        yield pycompat.byterepr(o)
72 78
def prettyrepr(o):
    """Pretty print a representation of a possibly-nested object

    The byte repr of ``o`` is cut in front of each ``<`` (or in front of the
    `` field=`` label that directly precedes it). Every piece goes on its
    own line, indented according to how many ``<`` are still unclosed at
    the start of the piece.
    """
    text = pycompat.byterepr(o)
    end = len(text)
    pieces = []
    start = scan = 0
    while start < end:
        # '... field=<type ... field=<type ...'
        #      ^      ^        ^      ^
        #      start  scan     cut    nextlt
        cut = -1
        nextlt = text.find('<', scan + 1)
        if nextlt < 0:
            # no further '<': the rest of the repr is the last piece
            nextlt = end
        elif nextlt > scan + 1 and text.startswith('=', nextlt - 1):
            # backtrack for ' field=<'
            cut = text.rfind(' ', scan + 1, nextlt - 1)
        if cut < 0:
            cut = nextlt
        else:
            cut += 1  # skip ' '
        depth = text.count('<', 0, start) - text.count('>', 0, start)
        assert depth >= 0
        pieces.append(' ' * depth + text[start:cut].rstrip())
        start, scan = cut, nextlt
    return '\n'.join(pieces)
98 104
def buildrepr(r):
    """Format an optional printable representation from unexpanded bits

    ``None`` gives an empty string; the other accepted forms are:

    ========  =================================
    type(r)   example
    ========  =================================
    tuple     ('<not %r>', other)
    bytes     '<branch closed>'
    callable  lambda: '<branch %r>' % sorted(b)
    object    other
    ========  =================================
    """
    if r is None:
        return ''
    if isinstance(r, tuple):
        # r[0] is the format string, the remaining items are its arguments
        fmtargs = pycompat.rapply(pycompat.maybebytestr, r[1:])
        return r[0] % fmtargs
    if isinstance(r, bytes):
        return r
    if callable(r):
        return r()
    return pprint(r)
121 127
def binary(s):
    """return true if a string is binary data

    A string counts as binary when it is non-empty and contains a NUL.
    """
    if not s:
        return False
    return '\0' in s
125 131
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns a (matcher name, pattern, matcher function) tuple.
    a missing or unknown prefix means a literal match; an invalid regular
    expression raises error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        regexsrc = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            compiled = remod.compile(regexsrc, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', regexsrc, compiled.search

    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        return 'literal', pattern, pattern.__eq__

    # Case-insensitive literal: compare the lowered forms of both sides.
    ipat = encoding.lower(pattern)
    return 'literal', pattern, lambda s: ipat == encoding.lower(s)
184 190
def shortuser(user):
    """Return a short representation of a user name or email address."""
    # Each step keeps one side of the first occurrence of a separator and
    # leaves the value untouched when the separator is absent.
    user = user.split('@', 1)[0]   # drop the domain
    user = user.split('<', 1)[-1]  # drop anything up to the '<'
    user = user.split(' ', 1)[0]   # keep the first word
    user = user.split('.', 1)[0]   # keep the part before the first dot
    return user
200 206
def emailuser(user):
    """Return the user portion of an email address."""
    # Everything before the first '@' (the whole value if there is none).
    local = user.partition('@')[0]
    # Then everything after the first '<', when one is present.
    before, bracket, after = local.partition('<')
    if bracket:
        return after
    return before
210 216
def email(author):
    '''get email of author.

    This is the text after the first '<' (or from the start when there is
    none) up to the first '>' (or to the end when there is none).
    '''
    begin = author.find('<') + 1
    close = author.find('>')
    if close == -1:
        return author[begin:]
    return author[begin:close]
217 223
def person(author):
    """Returns the name before an email address,
    interpreting it as per RFC 5322

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author
    bracket = author.find('<')
    if bracket == -1:
        # Bare address: use the local part, with dots turned into spaces.
        localpart = author[:author.find('@')]
        return localpart.replace('.', ' ')
    # "Name <address>": trim spaces and quotes, then unescape inner quotes.
    name = author[:bracket].strip(' "')
    return name.replace('\\"', '"')
244 250
@attr.s(hash=True)
class mailmapping(object):
    '''A username/email pair used as a key or a value of a parsed mailmap

    Declared hashable so that instances can serve as dictionary keys.
    '''
    # The email is mandatory and comes first positionally.
    email = attr.ib()
    # The name is optional; None when the entry carries no name.
    name = attr.ib(default=None)
251 257
252 258 def _ismailmaplineinvalid(names, emails):
253 259 '''Returns True if the parsed names and emails
254 260 in a mailmap entry are invalid.
255 261
256 262 >>> # No names or emails fails
257 263 >>> names, emails = [], []
258 264 >>> _ismailmaplineinvalid(names, emails)
259 265 True
260 266 >>> # Only one email fails
261 267 >>> emails = [b'email@email.com']
262 268 >>> _ismailmaplineinvalid(names, emails)
263 269 True
264 270 >>> # One email and one name passes
265 271 >>> names = [b'Test Name']
266 272 >>> _ismailmaplineinvalid(names, emails)
267 273 False
268 274 >>> # No names but two emails passes
269 275 >>> names = []
270 276 >>> emails = [b'proper@email.com', b'commit@email.com']
271 277 >>> _ismailmaplineinvalid(names, emails)
272 278 False
273 279 '''
274 280 return not emails or not names and len(emails) < 2
275 281
def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    Returns a dict that maps the commit identity of each valid line to its
    proper identity, both as ``mailmapping`` objects. ``None`` content
    gives an empty dict.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')
    """
    if mailmapcontent is None:
        return {}

    parsed = {}
    for rawline in mailmapcontent.splitlines():
        # A line that starts with a comment carries no mapping at all.
        if rawline.lstrip().startswith('#'):
            continue

        # names and emails collect what this line yields; pendingwords
        # holds the words of the name currently being read.
        names = []
        emails = []
        pendingwords = []

        for word in rawline.split():
            if word.startswith('#'):
                # A trailing comment ends the useful part of the line.
                break

            if not (word.startswith('<') and word.endswith('>')):
                # Another word of a person's name.
                pendingwords.append(word)
                continue

            # An email: strip the "<>" and close any name read before it.
            emails.append(word[1:-1])
            if pendingwords:
                names.append(' '.join(pendingwords))
                pendingwords = []

            # Nothing after a second email fits the .mailmap format.
            if len(emails) > 1:
                break

        # Require at least one email, plus either a name or a second email.
        if _ismailmaplineinvalid(names, emails):
            continue

        # The last email (and the second name, if any) identify the commit
        # author; the first email and name are the values to map to.
        commitname = names[-1] if len(names) == 2 else None
        commitkey = mailmapping(email=emails[-1], name=commitname)
        propername = names[0] if names else None
        parsed[commitkey] = mailmapping(email=emails[0], name=propername)

    return parsed
357 363
def mapname(mailmap, author):
    """Returns the author field according to the mailmap cache, or
    the original author field.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # A malformed author field or an empty mailmap leaves nothing to map.
    if not (isauthorwellformed(author) and mailmap):
        return author

    # Express the author as a mailmapping so it can be used as a key.
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # First try the full (name, email) identity.
        proper = mailmap[commit]
    except KeyError:
        # Then the email alone; an all-None mapping means "no match".
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Fields that the mapping leaves unset fall back to the commit values.
    name = proper.name or commit.name
    address = proper.email or commit.email
    return '%s <%s>' % (name, address)
404 410
# "Name <local@domain>": a name without '<', one whitespace character, then
# a bracketed address that contains an '@'.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    '''Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    matched = _correctauthorformat.match(author)
    return bool(matched)
427 433
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    Delegates to ``encoding.trim()`` with '...' as the ellipsis marker.
    """
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
431 437
def escapestr(s):
    """Return ``s`` with backslash escapes applied, as bytes

    A memoryview is first copied into bytes.
    """
    raw = bytes(s) if isinstance(s, memoryview) else s
    # Use the function behind s.encode('string_escape') directly, for
    # Python 3 compatibility.
    return codecs.escape_encode(raw)[0]
438 444
def unescapestr(s):
    """Return ``s`` with its backslash escapes decoded (see escapestr)"""
    decoded = codecs.escape_decode(s)
    # escape_decode() returns a tuple; the decoded data is its first item.
    return decoded[0]
441 447
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    When the direct conversion raises UnicodeEncodeError, ``str(obj)`` is
    converted with ``encoding.strtolocal()`` instead.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        converted = pycompat.bytestr(encoding.strtolocal(str(obj)))
    return converted
450 456
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed"""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
454 460
455 461 # delay import of textwrap
def _MBTextWrapper(**kwargs):
    """Return a TextWrapper that measures text in display columns

    The ``tw`` subclass is built on the first call. The module-level name
    ``_MBTextWrapper`` is then rebound to that class, so later calls
    instantiate it directly. ``kwargs`` are passed to the constructor.
    """
    global _MBTextWrapper

    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires use decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            # Split ucstr into (head, rest), where head is the longest
            # prefix that fits in space_left display columns.
            used = 0
            colwidth = encoding.ucolwidth
            for i, ch in enumerate(ucstr):
                used += colwidth(ch)
                if space_left < used:
                    return (ucstr[:i], ucstr[i:])
            # The remainder is put back on the chunk stack, compared with
            # r'' and joined with native strings, so it has to be a native
            # string like every other empty literal in this class.
            return ucstr, r''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # Put as much of the long chunk as fits on this line and
                # leave the rest on the stack for the next one.
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # Not allowed to break: give the chunk a line of its own.
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    _MBTextWrapper = tw
    return tw(**kwargs)
558 564
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width``, returning an encoded string

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three inputs are decoded with ``encoding.encoding``
    before wrapping, and the result is encoded again.
    """
    widest = max(len(hangindent), len(initindent))
    if width <= widest:
        # adjust for weird terminal size
        width = max(78, widest + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errmode = pycompat.sysstr(encoding.encodingmode)
    utext, uinit, uhang = [piece.decode(codec, errmode)
                           for piece in (line, initindent, hangindent)]

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinit,
                             subsequent_indent=uhang)
    return wrapper.fill(utext).encode(codec)
574 580
# Lower-case spellings accepted by parsebool().
_booleans = {
    '1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
    '0': False, 'no': False, 'false': False, 'off': False, 'never': False,
}

def parsebool(s):
    """Parse s into a boolean.

    The comparison ignores case. If s is not a valid boolean, returns None.
    """
    return _booleans.get(s.lower())
585 591
def evalpythonliteral(s):
    """Evaluate a string containing a Python literal expression"""
    # We could backport our tokenizer hack to rewrite '' to u'' if we want
    # On Python 3 the input is decoded as latin1 before evaluation.
    source = s.decode('latin1') if pycompat.ispy3 else s
    return ast.literal_eval(source)
General Comments 0
You need to be logged in to leave comments. Login now