##// END OF EJS Templates
dagutil: remove externalize() and externalizeall()...
dagutil: remove externalize() and externalizeall() They are unused after the previous commit. .. api:: externalize() and externalizeall() removed from dagutil Use .node() on a storage primitive to perform revision to node conversions. Differential Revision: https://phab.mercurial-scm.org/D4305

File last commit:

r39098:1419ba5e default
r39196:0e46b92b default
Show More
stringutil.py
588 lines | 19.5 KiB | text/x-python | PythonLexer
Yuya Nishihara
stringutil: move generic string helpers to new module...
r37101 # stringutil.py - utility for generic string formatting, parsing, etc.
#
# Copyright 2005 K. Thananchayan <thananck@yahoo.com>
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
# Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import
Yuya Nishihara
wireproto: convert python literal to object without using unsafe eval()...
r37494 import ast
Yuya Nishihara
stringutil: move generic string helpers to new module...
r37101 import codecs
import re as remod
import textwrap
from ..i18n import _
Connor Sheehan
templatefuncs: add mailmap template function...
r37227 from ..thirdparty import attr
Yuya Nishihara
stringutil: move generic string helpers to new module...
r37101
from .. import (
encoding,
error,
pycompat,
)
Augie Fackler
stringutil: add a new function to do minimal regex escaping...
# Characters treated as special in a regular expression. The list is taken
# from https://bugs.python.org/issue29995, which was part of Python 3.7.
_respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f')
# Translation table (code point -> backslash-escaped text) applied by
# reescape() through unicode.translate().
_regexescapemap = {ord(c): (b'\\' + c).decode('latin1') for c in _respecial}

def reescape(pat):
    """Drop-in replacement for re.escape.

    Bytes input is decoded as latin1, escaped, and encoded back to bytes;
    any other input is escaped with ``translate()`` and returned directly.
    """
    # NOTE: the escaping is done on unicode on purpose: the table can only
    # be applied with unicode.translate, not bytes.translate.
    if not isinstance(pat, bytes):
        return pat.translate(_regexescapemap)
    escaped = pat.decode('latin1').translate(_regexescapemap)
    return escaped.encode('latin1')
Yuya Nishihara
stringutil: flip the default of pprint() to bprefix=False...
def pprint(o, bprefix=False):
    """Pretty print an object.

    bytes, bytearray, list, dict, set and tuple values are rendered here
    (containers recursively, dict items and set members in sorted order);
    any other object is forwarded to ``pycompat.byterepr()``.

    ``bprefix`` selects whether bytes values are rendered with a leading
    ``b``; it is passed down to nested values.
    """
    def joined(items):
        # Every container branch renders its members the same way.
        return b', '.join(pprint(item, bprefix=bprefix) for item in items)

    if isinstance(o, bytes):
        template = "b'%s'" if bprefix else "'%s'"
        return template % escapestr(o)
    if isinstance(o, bytearray):
        # codecs.escape_encode() can't handle bytearray, so escapestr fails
        # without coercion.
        return "bytearray['%s']" % escapestr(bytes(o))
    if isinstance(o, list):
        return '[%s]' % joined(o)
    if isinstance(o, dict):
        pairs = ('%s: %s' % (pprint(k, bprefix=bprefix),
                             pprint(v, bprefix=bprefix))
                 for k, v in sorted(o.items()))
        return '{%s}' % b', '.join(pairs)
    if isinstance(o, set):
        return 'set([%s])' % joined(sorted(o))
    if isinstance(o, tuple):
        return '(%s)' % joined(o)
    return pycompat.byterepr(o)
Gregory Szorc
stringutil: add function to pretty print an object...
r37316
Yuya Nishihara
stringutil: promote smartset.prettyformat() to utility function...
def prettyrepr(o):
    """Pretty print a representation of a possibly-nested object

    The byte repr of ``o`` is cut into one line per ``<...`` group (a
    directly preceding `` field=`` label stays with its group), and every
    line is indented by the number of ``<`` still unclosed before it.
    """
    pieces = []
    text = pycompat.byterepr(o)
    start = scan = 0
    while start < len(text):
        # '... field=<type ... field=<type ...'
        #      ~~~~~~~~~~~~~~~~
        #      start scan      nextstart nextscan
        nextstart = -1
        nextscan = text.find('<', scan + 1)
        if nextscan < 0:
            # no further '<': the rest of the text is the last piece
            nextscan = len(text)
        elif nextscan > scan + 1 and text.startswith('=', nextscan - 1):
            # backtrack for ' field=<'
            nextstart = text.rfind(' ', scan + 1, nextscan - 1)
        if nextstart < 0:
            nextstart = nextscan
        else:
            nextstart += 1  # skip ' '
        depth = text.count('<', 0, start) - text.count('>', 0, start)
        assert depth >= 0
        pieces.append((depth, text[start:nextstart].rstrip()))
        start, scan = nextstart, nextscan
    # NOTE(review): the indent unit is reproduced exactly as it appears in
    # this listing; confirm its width against the repository copy.
    return '\n'.join(' ' * depth + piece for depth, piece in pieces)
Yuya Nishihara
stringutil: move _formatsetrepr() from smartset...
def buildrepr(r):
    """Format an optional printable representation from unexpanded bits

    ``None`` yields an empty string. Otherwise the result depends on the
    type of ``r``:

    ========  =================================
    type(r)   example
    ========  =================================
    tuple     ('<not %r>', other)
    bytes     '<branch closed>'
    callable  lambda: '<branch %r>' % sorted(b)
    object    other
    ========  =================================

    A tuple is a format string followed by its arguments, bytes are
    returned unchanged, a callable is invoked without arguments, and
    anything else is rendered with pprint().
    """
    if r is None:
        return ''
    if isinstance(r, tuple):
        fmt, args = r[0], r[1:]
        return fmt % pycompat.rapply(pycompat.maybebytestr, args)
    if isinstance(r, bytes):
        return r
    if callable(r):
        return r()
    return pprint(r)
Yuya Nishihara
stringutil: move _formatsetrepr() from smartset...
r38595
Yuya Nishihara
stringutil: move generic string helpers to new module...
def binary(s):
    """return true if a string is binary data

    A value counts as binary when it is non-empty and contains a NUL
    character.
    """
    if not s:
        return False
    return '\0' in s
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    An invalid regular expression raises error.ParseError. With
    casesensitive=False, regexes are compiled with the ignore-case flag and
    literals are compared after encoding.lower().

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', pattern, regex.search

    # Everything else is a literal; only an explicit prefix is stripped.
    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        match = pattern.__eq__
    else:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
def shortuser(user):
    """Return a short representation of a user name or email address.

    The value is trimmed in four steps, each applied only when its
    separator is present: drop everything from the first '@', drop
    everything up to the first '<', drop everything from the first space,
    and drop everything from the first '.'.
    """
    # (separator, keephead): keep the text before the separator when
    # keephead is true, otherwise keep the text after it.
    steps = (('@', True), ('<', False), (' ', True), ('.', True))
    for sep, keephead in steps:
        pos = user.find(sep)
        if pos < 0:
            continue
        user = user[:pos] if keephead else user[pos + 1:]
    return user
def emailuser(user):
    """Return the user portion of an email address.

    Everything from the first '@' is dropped, then everything up to and
    including the first '<' of what remains.
    """
    at = user.find('@')
    local = user[:at] if at >= 0 else user
    bracket = local.find('<')
    return local[bracket + 1:] if bracket >= 0 else local
def email(author):
    """get email of author.

    Returns the text after the first '<' (or from the beginning when there
    is none) up to the first '>' (or to the end when there is none).
    """
    start = author.find('<') + 1
    end = author.find('>')
    if end == -1:
        return author[start:]
    return author[start:end]
Connor Sheehan
stringutil: move person function from templatefilters...
def person(author):
    """Returns the name before an email address,
    interpreting it as per RFC 5322

    A value without '@' is returned unchanged. When a '<' is present, the
    text before it is stripped of surrounding spaces and double quotes and
    escaped quotes are unescaped. Otherwise the part before '@' is used,
    with dots turned into spaces.

    >>> person(b'foo@bar')
    'foo'
    >>> person(b'Foo Bar <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo Bar" <foo@bar>')
    'Foo Bar'
    >>> person(b'"Foo \"buz\" Bar" <foo@bar>')
    'Foo "buz" Bar'
    >>> # The following are invalid, but do exist in real-life
    ...
    >>> person(b'Foo "buz" Bar <foo@bar>')
    'Foo "buz" Bar'
    >>> person(b'"Foo Bar <foo@bar>')
    'Foo Bar'
    """
    if '@' not in author:
        return author
    bracket = author.find('<')
    if bracket == -1:
        # bare address: derive a name from the local part
        local = author[:author.find('@')]
        return local.replace('.', ' ')
    displayname = author[:bracket].strip(' "')
    return displayname.replace('\\"', '"')
Connor Sheehan
templatefuncs: add mailmap template function...
@attr.s(hash=True)
class mailmapping(object):
    """A username/email pair used as a key or a value of a parsed
    mailmap file.

    ``email`` is required; ``name`` defaults to None. The class is declared
    with ``hash=True`` through ``attr.s``.
    """
    email = attr.ib()
    name = attr.ib(default=None)
Connor Sheehan
stringutil: improve check for failed mailmap line parsing...
r37263 def _ismailmaplineinvalid(names, emails):
'''Returns True if the parsed names and emails
in a mailmap entry are invalid.
>>> # No names or emails fails
>>> names, emails = [], []
>>> _ismailmaplineinvalid(names, emails)
True
>>> # Only one email fails
>>> emails = [b'email@email.com']
>>> _ismailmaplineinvalid(names, emails)
True
>>> # One email and one name passes
>>> names = [b'Test Name']
>>> _ismailmaplineinvalid(names, emails)
False
>>> # No names but two emails passes
>>> names = []
>>> emails = [b'proper@email.com', b'commit@email.com']
>>> _ismailmaplineinvalid(names, emails)
False
'''
return not emails or not names and len(emails) < 2
Connor Sheehan
templatefuncs: add mailmap template function...
def parsemailmap(mailmapcontent):
    """Parses data in the .mailmap format

    Returns a dict mapping the commit-side ``mailmapping`` (last email on
    the line, plus the second name when two names were given) to the
    proper ``mailmapping`` (first email, first name if any). ``None``
    input yields an empty dict; comment lines and lines rejected by
    ``_ismailmaplineinvalid()`` are skipped.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> mm = parsemailmap(mmdata)
    >>> for key in sorted(mm.keys()):
    ...     print(key)
    mailmapping(email='commit1@email.xx', name=None)
    mailmapping(email='commit2@email.xx', name=None)
    mailmapping(email='commit3@email.xx', name=None)
    mailmapping(email='commit4@email.xx', name='Commit')
    >>> for val in sorted(mm.values()):
    ...     print(val)
    mailmapping(email='commit1@email.xx', name='Name')
    mailmapping(email='name@email.xx', name=None)
    mailmapping(email='proper@email.xx', name='Name')
    mailmapping(email='proper@email.xx', name='Name')
    """
    mailmap = {}

    if mailmapcontent is None:
        return mailmap

    for line in mailmapcontent.splitlines():
        # Whole-line comments carry no mapping
        if line.lstrip().startswith('#'):
            continue

        # names and emails collect what is parsed from this line;
        # namebuilder accumulates the words of the name being read
        names, emails = [], []
        namebuilder = []

        for element in line.split():
            if element.startswith('#'):
                # A trailing comment ends the useful part of the line
                break

            if not (element.startswith('<') and element.endswith('>')):
                # Another word of the name currently being read
                namebuilder.append(element)
                continue

            # An email: record it without the "<>" and close any pending
            # name
            emails.append(element[1:-1])
            if namebuilder:
                names.append(' '.join(namebuilder))
                namebuilder = []

            # Nothing after a second email fits the .mailmap spec
            if len(emails) > 1:
                break

        # At least one email is required, together with either a name or
        # a second email
        if _ismailmaplineinvalid(names, emails):
            continue

        commitname = names[-1] if len(names) == 2 else None
        propername = names[0] if names else None
        mailmapkey = mailmapping(email=emails[-1], name=commitname)
        mailmap[mailmapkey] = mailmapping(email=emails[0], name=propername)

    return mailmap
def mapname(mailmap, author):
    """Returns the author field according to the mailmap cache, or
    the original author field.

    The lookup is first tried with both name and email of ``author``, then
    with the email alone. Whatever the matching entry leaves unset is
    taken from ``author`` itself.

    >>> mmdata = b"\\n".join([
    ...     b'# Comment',
    ...     b'Name <commit1@email.xx>',
    ...     b'<name@email.xx> <commit2@email.xx>',
    ...     b'Name <proper@email.xx> <commit3@email.xx>',
    ...     b'Name <proper@email.xx> Commit <commit4@email.xx>',
    ... ])
    >>> m = parsemailmap(mmdata)
    >>> mapname(m, b'Commit <commit1@email.xx>')
    'Name <commit1@email.xx>'
    >>> mapname(m, b'Name <commit2@email.xx>')
    'Name <name@email.xx>'
    >>> mapname(m, b'Commit <commit3@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Commit <commit4@email.xx>')
    'Name <proper@email.xx>'
    >>> mapname(m, b'Unknown Name <unknown@email.com>')
    'Unknown Name <unknown@email.com>'
    """
    # A malformed author field or an empty mailmap leaves nothing to map
    if not isauthorwellformed(author) or not mailmap:
        return author

    # Turn the user name into a mailmapping
    commit = mailmapping(name=person(author), email=email(author))

    try:
        # Preferred key: both the commit name and the commit email
        proper = mailmap[commit]
    except KeyError:
        # Fall back to an email-only key; a separate object is built so
        # the original commit fields stay available below
        emailonly = mailmapping(email=commit.email)
        proper = mailmap.get(emailonly, mailmapping(None, None))

    # Fill in whatever the mailmap entry does not provide
    name = proper.name or commit.name
    address = proper.email or commit.email
    return '%s <%s>' % (name, address)
Connor Sheehan
stringutil: add isauthorwellformed function...
# "Name <local@domain>": a non-empty name without '<', one whitespace
# character, then an address in angle brackets that contains an '@'.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    """Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    """
    found = _correctauthorformat.match(author)
    return bool(found)
Yuya Nishihara
stringutil: move generic string helpers to new module...
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    The work is delegated to ``encoding.trim()`` with ``'...'`` as the
    ellipsis marker.
    """
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
def escapestr(s):
    """Return ``s`` with backslash escapes applied.

    A memoryview is converted to bytes first.
    """
    data = bytes(s) if isinstance(s, memoryview) else s
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    escaped, _consumed = codecs.escape_encode(data)
    return escaped
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded
    (``codecs.escape_decode()``)."""
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    ``pycompat.bytestr()`` is tried first; on UnicodeEncodeError the object
    is converted with ``str()`` and ``encoding.strtolocal()`` instead.
    """
    try:
        formatted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        formatted = pycompat.bytestr(encoding.strtolocal(str(obj)))
    return formatted
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    rendered = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return rendered.replace(b'\\\\', b'\\')
# delay import of textwrap
def _MBTextWrapper(**kwargs):
    """Build the width-aware TextWrapper subclass and return an instance.

    On first use the module-level name ``_MBTextWrapper`` is rebound to the
    class itself, so later calls instantiate it directly.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            # Split ucstr in front of the first character whose column
            # would exceed space_left.
            colwidth = encoding.ucolwidth
            used = 0
            for pos, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:pos], ucstr[pos:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                head, rest = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(head)
                reversed_chunks[-1] = rest
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:
                # cur_line collects the chunks of the line being built;
                # cur_len is their accumulated width.
                cur_line = []
                cur_len = 0

                # The first line gets the initial indent, later ones the
                # subsequent indent.
                indent = (self.subsequent_indent if lines
                          else self.initial_indent)

                # Maximum width for this line.
                width = self.width - len(indent)

                # A whitespace chunk at the start of a line is dropped,
                # except at the very beginning of the text.
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    chunkwidth = colwidth(chunks[-1])

                    # Stop as soon as the next chunk no longer fits.
                    if cur_len + chunkwidth > width:
                        break

                    cur_line.append(chunks.pop())
                    cur_len += chunkwidth

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Join the chunks of the finished line and add it to the
                # result.
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` display columns.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three values are decoded with ``encoding.encoding``
    and ``encoding.encodingmode``, wrapped with ``_MBTextWrapper``, and the
    result is encoded back with ``encoding.encoding``.
    """
    maxindent = max(len(hangindent), len(initindent))
    if width <= maxindent:
        # adjust for weird terminal size
        width = max(78, maxindent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errors = pycompat.sysstr(encoding.encodingmode)
    uline = line.decode(codec, errors)
    uinit = initindent.decode(codec, errors)
    uhang = hangindent.decode(codec, errors)

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinit,
                             subsequent_indent=uhang)
    filled = wrapper.fill(uline)
    return filled.encode(pycompat.sysstr(encoding.encoding))
# Lower-case spellings accepted by parsebool() and the value each one
# stands for.
_booleans = {
    '1': True,
    'yes': True,
    'true': True,
    'on': True,
    'always': True,
    '0': False,
    'no': False,
    'false': False,
    'off': False,
    'never': False,
}

def parsebool(s):
    """Parse s into a boolean.

    The comparison ignores case. If s is not a valid boolean, returns
    None.
    """
    key = s.lower()
    return _booleans.get(key)
Gregory Szorc
wireproto: syntax for encoding CBOR into frames...
r37306
Yuya Nishihara
wireproto: convert python literal to object without using unsafe eval()...
def evalpythonliteral(s):
    """Evaluate a string containing a Python literal expression

    The evaluation is done by ``ast.literal_eval()``. When
    ``pycompat.ispy3`` is set, the input is decoded as latin1 first.
    """
    # We could backport our tokenizer hack to rewrite '' to u'' if we want
    source = s.decode('latin1') if pycompat.ispy3 else s
    return ast.literal_eval(source)