##// END OF EJS Templates
merge with stable
merge with stable

File last commit:

r42002:25694a78 default
r43074:50e25f30 merge default
Show More
encoding.py
616 lines | 20.3 KiB | text/x-python | PythonLexer
Martin Geisler
put license and copyright info into comment blocks
r8226 # encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
Matt Mackall
Update license to GPLv2+
r10263 # GNU General Public License version 2 or any later version.
Matt Mackall
move encoding bits from util to encoding...
r7948
Yuya Nishihara
doctest: use print_function and convert bytes to unicode where needed
r34139 from __future__ import absolute_import, print_function
Gregory Szorc
encoding: use absolute_import
r27355
import locale
import os
import unicodedata
from . import (
error,
Yuya Nishihara
parsers: switch to policy importer...
r32372 policy,
Yuya Nishihara
pycompat: provide 'ispy3' constant...
r30030 pycompat,
Gregory Szorc
encoding: use absolute_import
r27355 )
Matt Mackall
move encoding bits from util to encoding...
r7948
Yuya Nishihara
encoding: extract stub for fast JSON escape...
r33925 from .pure import (
charencode as charencodepure,
)
Yuya Nishihara
encoding: drop circular import by proxying through '<policy>.charencode'...
r33756 charencode = policy.importmod(r'charencode')
Yuya Nishihara
encoding: add function to test if a str consists of ASCII characters...
r33927 isasciistr = charencode.isasciistr
Yuya Nishihara
encoding: drop circular import by proxying through '<policy>.charencode'...
r33756 asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
Yuya Nishihara
encoding: add fast path of jsonescape() (issue5533)...
r33926 _jsonescapeu8fast = charencode.jsonescapeu8fast
Yuya Nishihara
encoding: drop circular import by proxying through '<policy>.charencode'...
r33756
Yuya Nishihara
py3: convert encoding name and mode to str...
r30033 _sysstr = pycompat.sysstr
Yuya Nishihara
pycompat: provide 'ispy3' constant...
if pycompat.ispy3:
    # Python 3 has no unichr() builtin; chr() is used in its place
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.  Each entry is the UTF-8 encoding of one such code point.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in (
        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
        "206a 206b 206c 206d 206e 206f feff"
    ).split()
]
# hfsignoreclean() only looks for these two lead bytes before doing any
# work, so verify that every entry starts with one of them
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
Augie Fackler
encoding: add hfsignoreclean to clean out HFS-ignored characters...
r23596
def hfsignoreclean(s):
    """Strip the codepoints ignored by HFS+ from the byte string s.

    s is returned unchanged when it contains none of them.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every entry of _ignore starts with one of these two bytes, so when
    # neither is present there is nothing to remove
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not _nativeenviron:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {k.encode(r'utf-8'): v.encode(r'utf-8')
               for k, v in os.environ.items()}  # re-exports
elif pycompat.ispy3:
    # Python 3 with a bytes view of the environment available
    environ = os.environb  # re-exports
else:
    # Python 2: os.environ is used as-is
    environ = os.environ  # re-exports
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034
Martin von Zweigbergk
encoding: remove unnecessary lambdas from _encodingfixers...
# encoding names that are rewritten to another name before being used
_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if not pycompat.ispy3 and pycompat.iswindows:
    _encodingrewrites['cp65001'] = 'utf-8'
Matt Mackall
move encoding bits from util to encoding...
r7948
try:
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034 encoding = environ.get("HGENCODING")
Matt Mackall
move encoding bits from util to encoding...
r7948 if not encoding:
Pulkit Goyal
py3: make sure encoding.encoding is a bytes variable...
r30622 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
Martin von Zweigbergk
encoding: remove unnecessary lambdas from _encodingfixers...
r39871 encoding = _encodingrewrites.get(encoding, encoding)
Matt Mackall
move encoding bits from util to encoding...
r7948 except locale.Error:
encoding = 'ascii'
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034 encodingmode = environ.get("HGENCODINGMODE", "strict")
Matt Mackall
move encoding bits from util to encoding...
r7948 fallbackencoding = 'ISO-8859-1'
Yuya Nishihara
py3: change encoding.localstr to a subclass of bytes, not str
class localstr(bytes):
    '''Local-encoding byte string that remembers its UTF-8 original

    The value of the instance is the local representation ``l``; the UTF-8
    bytes ``u`` are kept in ``_utf8`` so that strings that are unmodified
    can be round-tripped to the local encoding and back.
    '''
    def __new__(cls, u, l):
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
Yuya Nishihara
encoding: introduce tagging type for non-lossy non-ASCII string...
class safelocalstr(bytes):
    """Marker type for a local string that came from an internal UTF-8
    string and can be converted back to UTF-8 without loss

    Apart from its type it behaves like the plain bytes value it wraps:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
Matt Mackall
move encoding bits from util to encoding...
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then the fallback encoding, and failing that, we use
    UTF-8 and replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII needs no conversion at all
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding is the internal one
                return s
            local = text.encode(_sysstr(encoding), r"replace")
            if text == local.decode(_sysstr(encoding)):
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            # lossy: keep the UTF-8 original next to the local form
            return localstr(s, local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), r"replace")
                if text == local.decode(_sysstr(encoding)):
                    # local is a safe, non-lossy encoding of s
                    return safelocalstr(local)
                return localstr(text.encode('UTF-8'), local)
            except UnicodeDecodeError:
                # last ditch: decode with replacement, can't round-trip
                text = s.decode("utf-8", "replace")
                return text.encode(_sysstr(encoding), r"replace")
    except LookupError as err:
        # the configured encoding name is unknown to Python
        raise error.Abort(err, hint="please check your locale settings")
Matt Mackall
move encoding bits from util to encoding...
r7948
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # a localstr carries its UTF-8 original: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is identical in UTF-8
    if isasciistr(s):
        return s

    try:
        return s.decode(_sysstr(encoding),
                        _sysstr(encodingmode)).encode("utf-8")
    except UnicodeDecodeError as decodeerr:
        # quote up to ten bytes on each side of the failing position
        begin = max(0, decodeerr.start - 10)
        context = s[begin:decodeerr.start + 10]
        raise error.Abort("decoding near '%s': %s!"
                          % (context, pycompat.bytestr(decodeerr)))
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
Matt Mackall
move encoding bits from util to encoding...
r7948
Yuya Nishihara
encoding: factor out unicode variants of from/tolocal()...
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    # go through UTF-8 bytes, the input form tolocal() expects
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which are then decoded
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
Yuya Nishihara
py3: add utility to forward __str__() to __bytes__()...
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    The returned function calls bytesfunc(obj) and converts its result with
    unifromlocal().
    """
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc
Yuya Nishihara
encoding: add converter between native str and byte string...
r31448 # converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if not pycompat.ispy3:
    # native str is already a byte string here: nothing to convert
    strtolocal = strfromlocal = strmethod = pycompat.identity
else:
    # native str is unicode: reuse the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
Yuya Nishihara
encoding: add converter between native str and byte string...
r31448
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = {tolocal(k.encode(r'utf-8')): tolocal(v.encode(r'utf-8'))
               for k, v in os.environ.items()}  # re-exports
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034
Matt Harbison
py3: rename pycompat.getcwd() to encoding.getcwd() (API)...
if not pycompat.ispy3:
    getcwd = os.getcwd  # re-exports
elif pycompat.iswindows:
    # os.getcwd() on Python 3 returns string.  Python 3 on Windows issues a
    # DeprecationWarning about using the bytes API when os.getcwdb() is
    # called, so convert the string result instead.
    getcwd = lambda: strtolocal(os.getcwd())  # re-exports
else:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    getcwd = os.getcwdb  # re-exports
Matt Mackall
encoding: default ambiguous character to narrow...
r12866 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
Yuya Nishihara
encoding: make sure "wide" variable never be referenced from other modules...
# east_asian_width() categories that ucolwidth() counts as two columns
_wide = _sysstr("WFA"
                if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                else "WF")
Matt Mackall
encoding: default ambiguous character to narrow...
r12866
Matt Mackall
move encoding bits from util to encoding...
def colwidth(s):
    """Find the column width of a string for display in the local encoding

    s is decoded with the local encoding (undecodable bytes are replaced)
    and measured with ucolwidth().
    """
    text = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(text)
FUJIWARA Katsunori
i18n: calculate terminal columns by width information of each characters...
r15066
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    A character whose east asian width category is listed in _wide counts
    as two columns, any other character as one.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width information available: one column per character
        return len(d)
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
Matt Mackall
encoding: add getcols to extract substrings based on column width
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate slices ``s[start:x]`` are tried with increasing ``x`` and the
    first one whose colwidth() is exactly ``c`` is returned.  None is
    returned when no candidate matches.
    '''
    # The upper bound is len(s) + 1 so that the slice running to the very
    # end of s is tried too; stopping at len(s) made it impossible to ever
    # return a substring that reaches the end of the string.
    for x in pycompat.xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None
FUJIWARA Katsunori
encoding: add 'leftside' argument into 'trim' to switch trimming side
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        text = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # s is not valid in the local encoding: work on raw bytes, one
        # column per byte
        if len(s) <= width:
            return s  # trimming is not needed
        room = width - len(ellipsis)
        if room <= 0:
            # not enough room even for ellipsis: cut it to 'width'
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(text) <= width:
        return s  # trimming is not needed

    # columns left for the text itself once the ellipsis is placed
    room = width - len(ellipsis)
    if room <= 0:
        # not enough room even for ellipsis: cut it to 'width'
        return ellipsis[:width]

    # drop one more character from the trimmed side on each pass until the
    # remainder fits into the columns that are left
    for dropped in pycompat.xrange(1, len(text)):
        if leftside:
            kept = text[dropped:]
        else:
            kept = text[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis  # not enough room for multi-column characters
Matt Mackall
encoding: add an encoding-aware lower function
def lower(s):
    """best-effort encoding-aware case-folding of local string s

    asciilower() is tried first; when it raises UnicodeDecodeError the
    string is decoded, lowercased as unicode and encoded back to the local
    encoding.  s itself is returned when lowercasing changes nothing.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass  # fall through to the encoding-aware path

    try:
        if isinstance(s, localstr):
            # use the UTF-8 original kept by localstr
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s  # preserve localstring
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
def upper(s):
    """best-effort encoding-aware case-folding of local string s

    asciiupper() is tried first; upperfallback() handles the input when
    that raises UnicodeDecodeError.
    """
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
    return folded
def upperfallback(s):
    """Encoding-aware uppercasing of local string s

    The string is decoded, uppercased as unicode and encoded back to the
    local encoding.  s itself is returned when uppercasing changes nothing.
    """
    try:
        if isinstance(s, localstr):
            # use the UTF-8 original kept by localstr
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s  # preserve localstring
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
Matt Mackall
encoding: introduce utf8-b helpers
r16133
Siddharth Agarwal
encoding: define an enum that specifies what normcase does to ASCII strings...
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII strings

    The value is chosen per platform and should be consistent with what
    normcase on that platform actually does.

    This should be kept in sync with normcase_spec in util.h.
    '''
    # normcase lowercases ASCII strings
    lower = -1
    # normcase uppercases ASCII strings
    upper = 1
    # the fallback function should always be called
    other = 0
Yuya Nishihara
encoding: add option to escape non-ascii characters in JSON...
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    utf8b = toutf8b(s)
    try:
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        # the fast implementation raised ValueError for this input; let the
        # pure fallback below handle it
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
Matt Mackall
encoding: add json escaping filter...
r22426
Yuya Nishihara
py3: use 'surrogatepass' error handler to process U+DCxx transparently...
r34215 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# number of bytes getutf8char() tries to decode, indexed by the high nibble
# of the first byte; 0 means the byte is returned on its own as ASCII
_utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]
def getutf8char(s, pos):
'''get the next full utf-8 character in the given string, starting at pos
Raises a UnicodeError if the given location does not start a valid
utf-8 character.
'''
# find how many bytes to attempt decoding from first nibble
Augie Fackler
encoding: ensure getutf8char always returns a bytestr, never an int
r34197 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
Matt Mackall
encoding: add getutf8char helper...
r26875 if not l: # ascii
Augie Fackler
encoding: ensure getutf8char always returns a bytestr, never an int
r34197 return s[pos:pos + 1]
Matt Mackall
encoding: add getutf8char helper...
r26875
c = s[pos:pos + l]
# validate with attempted decode
Yuya Nishihara
py3: use 'surrogatepass' error handler to process U+DCxx transparently...
r34215 c.decode("utf-8", _utf8strict)
Matt Mackall
encoding: add getutf8char helper...
r26875 return c
Matt Mackall
encoding: introduce utf8-b helpers
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if "\xed" not in s:
        # without a "\xed" byte there can be no pre-existing U+DCxx
        # sequence, so valid UTF-8 may be returned untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    # collect the output pieces and join once at the end; repeated
    # concatenation of an immutable string is quadratic in its length
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            # not the start of a valid character: escape this byte
            c = None
        if c is None or "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # invalid bytes, and the bytes of existing U+DCxx characters
            # (which have to be re-escaped), are mapped one byte at a
            # time to U+DC00 + byte value
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        else:
            pos += len(c)
        chunks.append(c)
    return "".join(chunks)
Matt Mackall
encoding: introduce utf8-b helpers
r16133
def fromutf8b(s):
    '''Given a UTF-8b string, return the original local, possibly-binary
    string.

    This is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # collect the output pieces and join once at the end; repeated
    # concatenation of an immutable string is quadratic in its length
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters: the low 8 bits of the code point
        # are the original byte
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
        chunks.append(c)
    return "".join(chunks)