# stringutil.py - utility for generic string formatting, parsing, etc.
#
#  Copyright 2005 K. Thananchayan <thananck@yahoo.com>
#  Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#  Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import | ||||
import codecs | ||||
import re as remod | ||||
import textwrap | ||||
from ..i18n import _ | ||||
Connor Sheehan
|
r37227 | from ..thirdparty import attr | ||
Yuya Nishihara
|
r37101 | |||
from .. import ( | ||||
encoding, | ||||
error, | ||||
pycompat, | ||||
) | ||||
# Lookup table from each single byte to its escaped spelling.  Every byte
# gets the generic "\xNN" form first; backslash, CR and LF are then given
# their conventional short escapes.
_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'

# Bytes that are rewritten: 0x00-0x08 and 0x0a-0x1f (i.e. control bytes
# other than tab), the backslash itself, and everything from 0x7f upwards.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')


def escapedata(s):
    """Return ``s`` with every byte matched by ``_DATA_ESCAPE_RE`` replaced
    by its entry in ``_DATA_ESCAPE_MAP``.

    A ``bytearray`` argument is converted to ``bytes`` before substitution.
    """
    if isinstance(s, bytearray):
        s = bytes(s)

    def _escaped(match):
        # the pattern is a single character class, so group(0) is one byte
        return _DATA_ESCAPE_MAP[match.group(0)]

    return _DATA_ESCAPE_RE.sub(_escaped, s)
def binary(s):
    """Return True if ``s`` is non-empty and contains a NUL character.

    Falsy input (empty string, None) is reported as not binary.
    """
    if not s:
        return False
    return '\0' in s
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    An 're:' pattern that fails to compile raises error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    # Regular expression: strip the prefix, compile, hand back .search
    if pattern.startswith('re:'):
        source = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            compiled = remod.compile(source, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', source, compiled.search

    # Everything else is a literal; only an explicit 'literal:' prefix is
    # removed, any other prefix stays part of the pattern.
    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        return 'literal', pattern, pattern.__eq__

    # case-insensitive literal: compare the lower-cased forms
    folded = encoding.lower(pattern)

    def foldedmatch(s):
        return folded == encoding.lower(s)

    return 'literal', pattern, foldedmatch
def shortuser(user):
    """Return a short representation of a user name or email address.

    The value is narrowed in four steps: drop everything from the first
    '@', drop everything up to and including the first '<', then cut at
    the first space and at the first '.'.
    """
    # keep only the part before the domain
    user = user.partition('@')[0]
    # keep only what follows an opening angle bracket, if there is one
    _before, bracket, after = user.partition('<')
    if bracket:
        user = after
    # first word, then first dot-separated component
    user = user.partition(' ')[0]
    return user.partition('.')[0]
def emailuser(user):
    """Return the user portion of an email address.

    Everything from the first '@' onwards is dropped, then everything up
    to and including the first '<' of what remains.
    """
    local = user.partition('@')[0]
    _before, bracket, after = local.partition('<')
    return after if bracket else local
def email(author):
    """Return the email part of an author string.

    That is the text after the first '<' (or from the start when there is
    no '<') up to, but not including, the first '>' (or to the end when
    there is no '>').
    """
    start = author.find('<') + 1
    end = author.find('>')
    if end < 0:
        return author[start:]
    return author[start:end]
Connor Sheehan
|
def person(author):
    r"""Returns the name before an email address,
    interpreting it as per RFC 5322

    When there is no '@' the input is returned unchanged.  When there is
    a '<', the text before it is stripped of surrounding spaces and double
    quotes and escaped quotes are unescaped.  Otherwise the part before
    the '@' is returned with dots turned into spaces.

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \\"buz\\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author
    display, bracket, _address = author.partition('<')
    if bracket:
        return display.strip(' "').replace('\\"', '"')
    # bare address: derive a name from the local part
    localpart = author.partition('@')[0]
    return localpart.replace('.', ' ')
Connor Sheehan
|
# hash=True asks attrs to generate __hash__; parsemailmap() uses instances
# of this class as dictionary keys.
@attr.s(hash=True)
class mailmapping(object):
    '''Represents a username/email key or value in
    a mailmap file'''

    # email address of the entry (no default, so it must be supplied)
    email = attr.ib()
    # name of the entry; None when the entry carries no name
    name = attr.ib(default=None)
def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    Returns a dict whose keys describe the identity found in a commit
    (the last email on the line, plus the commit name if two names were
    given) and whose values describe the proper identity (the first email
    and, if present, the first name).  ``None`` input yields an empty dict.
    Comment lines and lines from which no ``<email>`` token can be
    extracted are ignored.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')

    A line whose only email sits behind a comment marker is skipped:

    >>> parsemailmap(b'Name # <commit@email.xx>')
    {}
    """
    mailmap = {}

    if mailmapcontent is None:
        return mailmap

    for line in mailmapcontent.splitlines():

        # Don't bother checking the line if it is a comment or
        # is an improperly formed author field
        if line.lstrip().startswith('#') or any(c not in line for c in '<>@'):
            continue

        # names, emails hold the parsed emails and names for each line
        # namebuilder holds the words in a persons name
        names, emails = [], []
        namebuilder = []

        for element in line.split():
            if element.startswith('#'):
                # If we reach a comment in the mailmap file, move on
                break

            elif element.startswith('<') and element.endswith('>'):
                # We have found an email.
                # Parse it, and finalize any names from earlier
                emails.append(element[1:-1])  # Slice off the "<>"

                if namebuilder:
                    names.append(' '.join(namebuilder))
                    namebuilder = []

                # Break if we have found a second email, any other
                # data does not fit the spec for .mailmap
                if len(emails) > 1:
                    break

            else:
                # We have found another word in the committers name
                namebuilder.append(element)

        # The quick '<>@' check above does not guarantee that a complete
        # "<email>" token was seen (e.g. "Name # <a@b>" or "Name < a@b >").
        # Without an email there is nothing to map, so skip the line
        # instead of failing on the empty list below.
        if not emails:
            continue

        mailmapkey = mailmapping(
            email=emails[-1],
            name=names[-1] if len(names) == 2 else None,
        )

        mailmap[mailmapkey] = mailmapping(
            email=emails[0],
            name=names[0] if names else None,
        )

    return mailmap
def mapname(mailmap, author):
    """Returns the author field according to the mailmap cache, or
    the original author field.

    The lookup is tried with the commit's name and email first and then
    with the email alone; any part the mailmap does not supply is taken
    from the original author field.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # Nothing to do for a malformed author field or an empty mailmap
    if not (isauthorwellformed(author) and mailmap):
        return author

    # Identity as recorded in the commit
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # Most specific key first: both name and email
        proper = mailmap[commit]
    except KeyError:
        # Fall back to an email-only key; an all-None mapping stands in
        # when that is missing too, so the commit values are used below
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Prefer the mapped values, keep the commit's where none is mapped
    mappedname = proper.name or commit.name
    mappedemail = proper.email or commit.email
    return '%s <%s>' % (mappedname, mappedemail)
Connor Sheehan
|
# "Some Name <local@domain>": a non-empty name without '<', one whitespace
# character, then a bracketed address that contains an '@'.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')


def isauthorwellformed(author):
    '''Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    found = _correctauthorformat.match(author)
    return bool(found)
Yuya Nishihara
|
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display."""
    # the trimming itself is done by encoding.trim, with '...' passed as
    # its ellipsis argument
    return encoding.trim(text, maxlength, ellipsis='...')
def escapestr(s):
    """Return ``s`` with backslash escapes applied by the codecs module."""
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    escaped, _consumed = codecs.escape_encode(s)
    return escaped
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded by the codecs module
    (the inverse direction of escapestr)."""
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    pycompat.bytestr() is tried first; if that raises UnicodeEncodeError
    the object is formatted with str() and passed through
    encoding.strtolocal() before conversion.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        converted = pycompat.bytestr(localized)
    return converted
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed
    to single ones."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
# The TextWrapper subclass below is only created on the first call; that
# call then rebinds the module-level name to the class itself.
def _MBTextWrapper(**kwargs):
    """Return a column-width-aware text wrapper built from ``kwargs``.

    ``kwargs`` are passed unchanged to the wrapper's constructor.  The first
    call defines the ``tw`` subclass and replaces this factory function with
    it at module level, so later ``_MBTextWrapper(...)`` calls instantiate
    the class directly.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ``ucstr`` in two so that the first part occupies at
            most ``space_left`` columns; the second part is the rest."""
            l = 0
            colwidth = encoding.ucolwidth
            for i, char in enumerate(ucstr):
                l += colwidth(char)
                if space_left < l:
                    return (ucstr[:i], ucstr[i:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # NOTE(review): _cutdown() returns an empty first part when
                # the first character alone is wider than space_left, in
                # which case nothing is consumed from the chunk here.
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # Replace this factory with the class so it is only defined once.
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` columns and return the encoded result.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones.  All three strings are decoded with encoding.encoding /
    encoding.encodingmode, wrapped with the column-width-aware wrapper, and
    the filled text is encoded back with encoding.encoding.
    """
    widestindent = max(len(hangindent), len(initindent))
    if width <= widestindent:
        # adjust for weird terminal size
        width = max(78, widestindent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errormode = pycompat.sysstr(encoding.encodingmode)

    utext = line.decode(codec, errormode)
    ufirst = initindent.decode(codec, errormode)
    urest = hangindent.decode(codec, errormode)

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=ufirst,
                             subsequent_indent=urest)
    filled = wrapper.fill(utext)
    return filled.encode(codec)
# Accepted spellings of boolean values, keyed by their lower-case form.
_booleans = dict.fromkeys(('1', 'yes', 'true', 'on', 'always'), True)
_booleans.update(dict.fromkeys(('0', 'no', 'false', 'off', 'never'), False))


def parsebool(s):
    """Parse s into a boolean.

    The comparison ignores case.
    If s is not a valid boolean, returns None.
    """
    key = s.lower()
    return _booleans.get(key)