stringutil.py
541 lines
| 18.0 KiB
| text/x-python
|
PythonLexer
Yuya Nishihara
|
# stringutil.py - utility for generic string formatting, parsing, etc.
#
# Copyright 2005 K. Thananchayan <thananck@yahoo.com>
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
# Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import | ||||
Yuya Nishihara
|
r37494 | import ast | ||
Yuya Nishihara
|
r37101 | import codecs | ||
import re as remod | ||||
import textwrap | ||||
from ..i18n import _ | ||||
Connor Sheehan
|
r37227 | from ..thirdparty import attr | ||
Yuya Nishihara
|
r37101 | |||
from .. import ( | ||||
encoding, | ||||
error, | ||||
pycompat, | ||||
) | ||||
Yuya Nishihara
|
def pprint(o, bprefix=False):
    """Pretty print an object.

    bytes are rendered as a quoted, escaped string (with a leading ``b``
    when ``bprefix`` is true), bytearrays as ``bytearray['...']``, and
    lists, dicts and tuples are rendered recursively. Dict items are
    emitted in sorted order. Anything else falls back to
    ``pycompat.byterepr()``.
    """
    if isinstance(o, bytes):
        template = "b'%s'" if bprefix else "'%s'"
        return template % escapestr(o)

    if isinstance(o, bytearray):
        # codecs.escape_encode() can't handle bytearray, so escapestr fails
        # without coercion.
        return "bytearray['%s']" % escapestr(bytes(o))

    if isinstance(o, list):
        members = (pprint(a, bprefix=bprefix) for a in o)
        return '[%s]' % (b', '.join(members))

    if isinstance(o, dict):
        pairs = ('%s: %s' % (pprint(k, bprefix=bprefix),
                             pprint(v, bprefix=bprefix))
                 for k, v in sorted(o.items()))
        return '{%s}' % (b', '.join(pairs))

    if isinstance(o, tuple):
        members = (pprint(a, bprefix=bprefix) for a in o)
        return '(%s)' % (b', '.join(members))

    return pycompat.byterepr(o)
Gregory Szorc
|
r37316 | |||
Yuya Nishihara
|
def prettyrepr(o):
    """Pretty print a representation of a possibly-nested object

    The byte repr of ``o`` is split in front of every '<' (or in front of
    the ' field=' word that introduces it), and each piece is indented
    according to the number of '<...>' groups still open where it starts.
    """
    text = pycompat.byterepr(o)
    rows = []
    start = anchor = 0
    while start < len(text):
        # '... field=<type ... field=<type ...'
        #      ~~~~~~~~~~~~~~~~
        #      start anchor    cut   nextangle
        cut = -1
        nextangle = text.find('<', anchor + 1)
        if nextangle < 0:
            # no further nested object; the piece runs to the end
            nextangle = len(text)
        elif nextangle > anchor + 1 and text.startswith('=', nextangle - 1):
            # backtrack for ' field=<'
            cut = text.rfind(' ', anchor + 1, nextangle - 1)
        if cut < 0:
            cut = nextangle
        else:
            cut += 1  # skip ' '
        # nesting depth = brackets opened minus brackets closed so far
        depth = text.count('<', 0, start) - text.count('>', 0, start)
        assert depth >= 0
        rows.append((depth, text[start:cut].rstrip()))
        start, anchor = cut, nextangle
    return '\n'.join(' ' * depth + piece for depth, piece in rows)
Yuya Nishihara
|
def binary(s):
    """Return True if ``s`` is non-empty and contains a NUL character."""
    if not s:
        return False
    return '\0' in s
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    The returned matcher function takes one string. The 're' matcher is
    the compiled pattern's ``search`` method; the 'literal' matcher returns
    True or False. An invalid regular expression raises error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        try:
            flags = 0
            if not casesensitive:
                flags = remod.I
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            # format the exception through forcebytestr() rather than
            # interpolating the exception object directly
            raise error.ParseError(_('invalid regular expression: %s')
                                   % forcebytestr(e))
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        # Use the == operator instead of the bound pattern.__eq__: the
        # latter returns the truthy NotImplemented singleton when handed an
        # operand of a different type, which would read as a match.
        match = lambda s: pattern == s
    else:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
def shortuser(user): | ||||
"""Return a short representation of a user name or email address.""" | ||||
f = user.find('@') | ||||
if f >= 0: | ||||
user = user[:f] | ||||
f = user.find('<') | ||||
if f >= 0: | ||||
user = user[f + 1:] | ||||
f = user.find(' ') | ||||
if f >= 0: | ||||
user = user[:f] | ||||
f = user.find('.') | ||||
if f >= 0: | ||||
user = user[:f] | ||||
return user | ||||
def emailuser(user): | ||||
"""Return the user portion of an email address.""" | ||||
f = user.find('@') | ||||
if f >= 0: | ||||
user = user[:f] | ||||
f = user.find('<') | ||||
if f >= 0: | ||||
user = user[f + 1:] | ||||
return user | ||||
def email(author): | ||||
'''get email of author.''' | ||||
r = author.find('>') | ||||
if r == -1: | ||||
r = None | ||||
return author[author.find('<') + 1:r] | ||||
Connor Sheehan
|
def person(author):
    """Returns the name before an email address,
    interpreting it as per RFC 5322

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    # no email address at all: the whole field is the name
    if '@' not in author:
        return author

    bracket = author.find('<')
    if bracket == -1:
        # bare address: use the local part, with dots turned into spaces
        local = author[:author.find('@')]
        return local.replace('.', ' ')

    # 'Name <addr>': trim blanks and surrounding quotes, unescape \"
    name = author[:bracket].strip(' "')
    return name.replace('\\"', '"')
Connor Sheehan
|
@attr.s(hash=True)
class mailmapping(object):
    """A username/email pair, used as a key or as a value of a mailmap.

    ``email`` has no default; ``name`` defaults to None.
    """
    # email address of the entry
    email = attr.ib()
    # user name of the entry, None when not given
    name = attr.ib(default=None)
Connor Sheehan
|
def _ismailmaplineinvalid(names, emails):
    """Returns True if the parsed names and emails
    in a mailmap entry are invalid.

    An entry is valid when it has at least one email and, in addition,
    either at least one name or a second email.

    >>> # No names or emails fails
    >>> names, emails = [], []
    >>> _ismailmaplineinvalid(names, emails)
    True
    >>> # Only one email fails
    >>> emails = [b'email@email.com']
    >>> _ismailmaplineinvalid(names, emails)
    True
    >>> # One email and one name passes
    >>> names = [b'Test Name']
    >>> _ismailmaplineinvalid(names, emails)
    False
    >>> # No names but two emails passes
    >>> names = []
    >>> emails = [b'proper@email.com', b'commit@email.com']
    >>> _ismailmaplineinvalid(names, emails)
    False
    """
    if not emails:
        return True
    if names:
        return False
    return len(emails) < 2
Connor Sheehan
|
def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    Returns a dict mapping the commit identity of each entry (a
    ``mailmapping``) to the proper identity it should be replaced with.
    ``None`` content yields an empty dict.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')
    """
    result = {}

    if mailmapcontent is None:
        return result

    for rawline in mailmapcontent.splitlines():
        # Whole-line comments carry no entry
        if rawline.lstrip().startswith('#'):
            continue

        # Names and emails found on this line, in order of appearance;
        # pending collects the words of the name currently being read
        names = []
        emails = []
        pending = []

        for word in rawline.split():
            if word.startswith('#'):
                # A trailing comment ends the entry
                break

            if not (word.startswith('<') and word.endswith('>')):
                # Another word of the name preceding the next email
                pending.append(word)
                continue

            # An email: record it without the "<>" and close the name
            # that was being built, if any
            emails.append(word[1:-1])
            if pending:
                names.append(' '.join(pending))
                pending = []

            # Anything after a second email does not fit the .mailmap
            # format
            if len(emails) > 1:
                break

        # We require at least one email, and either at least one
        # name or a second email
        if _ismailmaplineinvalid(names, emails):
            continue

        # The commit identity is the last email, plus the second name
        # when two names were given
        commitname = names[-1] if len(names) == 2 else None
        key = mailmapping(email=emails[-1], name=commitname)

        # The proper identity is the first email and the first name
        propername = names[0] if names else None
        result[key] = mailmapping(email=emails[0], name=propername)

    return result
def mapname(mailmap, author):
    """Returns the author field according to the mailmap cache, or
    the original author field.

    ``mailmap`` is a dict as returned by ``parsemailmap()``. The lookup
    first uses both the name and the email of ``author`` and then the
    email alone. Whatever the matching entry leaves unset is taken from
    ``author`` itself.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # A malformed author field or an empty mailmap leaves nothing to map
    if not isauthorwellformed(author) or not mailmap:
        return author

    # Identity recorded in the commit
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # Most specific key first: commit email and name together
        proper = mailmap[commit]
    except KeyError:
        # Fall back to an email-only key; an all-None entry stands in
        # when that is missing too
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Prefer the mapped values, keeping the commit's own where unset
    name = proper.name or commit.name
    address = proper.email or commit.email
    return '%s <%s>' % (name, address)
Connor Sheehan
|
# A name without '<', one whitespace character, then '<local@domain>'
# closing the string.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    """Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    """
    found = _correctauthorformat.match(author)
    return bool(found)
Yuya Nishihara
|
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    The work is delegated to ``encoding.trim()`` with '...' as the
    ellipsis marker.
    """
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
def escapestr(s):
    """Return ``s`` escaped by ``codecs.escape_encode()``."""
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    escaped, _length = codecs.escape_encode(s)
    return escaped
def unescapestr(s):
    """Return ``s`` unescaped by ``codecs.escape_decode()``."""
    unescaped, _length = codecs.escape_decode(s)
    return unescaped
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    ``pycompat.bytestr()`` is tried first. If it raises UnicodeEncodeError,
    ``str(obj)`` is converted with ``encoding.strtolocal()`` instead.
    """
    try:
        formatted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        formatted = pycompat.bytestr(encoding.strtolocal(str(obj)))
    return formatted
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    representation = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return representation.replace(b'\\\\', b'\\')
# delay import of textwrap
def _MBTextWrapper(**kwargs):
    """Create a column-width-aware TextWrapper.

    The wrapper class is built on the first call. This function then
    rebinds the module-level name ``_MBTextWrapper`` to that class, so
    later calls through the module name instantiate the class directly.
    ``kwargs`` are passed to the class constructor.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires a user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ucstr into a part fitting in space_left columns and
            the remainder."""
            l = 0
            colwidth = encoding.ucolwidth
            # enumerate() instead of indexing over xrange(len(...)), which
            # only exists on Python 2
            for i, char in enumerate(ucstr):
                l += colwidth(char)
                if space_left < l:
                    return (ucstr[:i], ucstr[i:])
            # the whole string fits; r'' matches the other empty-string
            # literals in this class
            return ucstr, r''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            # always leave room for at least one column so progress is made
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # words may not be broken: give the long word its own line
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` columns and return it as a byte string.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three inputs are decoded with the encoding and
    error mode configured in the ``encoding`` module, filled by the
    width-aware text wrapper, and the result is encoded back.
    """
    widest = max(len(hangindent), len(initindent))
    if width <= widest:
        # adjust for weird terminal size
        width = max(78, widest + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errors = pycompat.sysstr(encoding.encodingmode)
    line = line.decode(codec, errors)
    initindent = initindent.decode(codec, errors)
    hangindent = hangindent.decode(codec, errors)

    filler = _MBTextWrapper(width=width,
                            initial_indent=initindent,
                            subsequent_indent=hangindent)
    filled = filler.fill(line)
    return filled.encode(codec)
# Accepted spellings of boolean values, keyed by their lower-case form.
_booleans = {
    '1': True,
    'yes': True,
    'true': True,
    'on': True,
    'always': True,
    '0': False,
    'no': False,
    'false': False,
    'off': False,
    'never': False,
}

def parsebool(s):
    """Parse s into a boolean.

    The lookup is case-insensitive.
    If s is not a valid boolean, returns None.
    """
    key = s.lower()
    return _booleans.get(key)
Gregory Szorc
|
r37306 | |||
Yuya Nishihara
|
def evalpythonliteral(s):
    """Evaluate a string containing a Python literal expression

    When ``pycompat.ispy3`` is true, ``s`` is decoded as latin1 before
    being handed to ``ast.literal_eval()``.
    """
    # We could backport our tokenizer hack to rewrite '' to u'' if we want
    source = s.decode('latin1') if pycompat.ispy3 else s
    return ast.literal_eval(source)