Show More
encoding.py
709 lines
| 22.0 KiB
| text/x-python
|
PythonLexer
/ mercurial / encoding.py
Martin Geisler
|
r8226 | # encoding.py - character transcoding support for Mercurial | ||
# | ||||
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others | ||||
# | ||||
# This software may be used and distributed according to the terms of the | ||||
Matt Mackall
|
r10263 | # GNU General Public License version 2 or any later version. | ||
Matt Mackall
|
r7948 | |||
Yuya Nishihara
|
r34139 | from __future__ import absolute_import, print_function | ||
Gregory Szorc
|
r27355 | |||
import locale | ||||
import os | ||||
import unicodedata | ||||
Gregory Szorc
|
r43359 | from .pycompat import getattr | ||
Gregory Szorc
|
r27355 | from . import ( | ||
error, | ||||
Yuya Nishihara
|
r32372 | policy, | ||
Yuya Nishihara
|
r30030 | pycompat, | ||
Gregory Szorc
|
r27355 | ) | ||
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | from .pure import charencode as charencodepure | ||
Yuya Nishihara
|
r33925 | |||
Yuya Nishihara
|
r44212 | if pycompat.TYPE_CHECKING: | ||
Augie Fackler
|
r43802 | from typing import ( | ||
Any, | ||||
Callable, | ||||
List, | ||||
Text, | ||||
Type, | ||||
TypeVar, | ||||
Union, | ||||
) | ||||
# keep pyflakes happy | ||||
for t in (Any, Callable, List, Text, Type, Union): | ||||
assert t | ||||
Yuya Nishihara
|
r44075 | _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr') | ||
Augie Fackler
|
r43802 | |||
Augie Fackler
|
# charencode implementation obtained through the module policy; the helpers
# below are re-exported from it under module-level names.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand used throughout this module when passing encoding names (kept
# as bytes) to str.encode()/bytes.decode()
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is bound to that name instead
    unichr = chr
Augie Fackler
|
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry is the UTF-8 encoding of one of the listed hex codepoints.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
# (hfsignoreclean only scans strings containing one of these two lead bytes)
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
Augie Fackler
|
r23596 | |||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip from s every codepoint that HFS+ ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored sequence starts with one of these two lead bytes, so a
    # string containing neither of them needs no work at all.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    cleaned = s
    for seq in _ignore:
        cleaned = cleaned.replace(seq, b'')
    return cleaned
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not _nativeenviron:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )
elif pycompat.ispy3:
    # Python 3 with a native bytes view of the environment
    environ = os.environb  # re-exports
else:
    # Python 2: os.environ already holds byte strings
    environ = os.environ  # re-exports
Yuya Nishihara
|
r30034 | |||
Martin von Zweigbergk
|
# Encoding names that are rewritten to another name before being used as the
# local encoding.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # a non-empty HGENCODING takes precedence over the locale's preference
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        # NOTE(review): the nesting of the rewrite lookup under this ``if``
        # is reconstructed from a listing that lost indentation - confirm.
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# error handler name passed to bytes.decode() by fromlocal()/lower()/upper()
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# tried by tolocal() when a stored string is not valid UTF-8
fallbackencoding = b'ISO-8859-1'
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
class localstr(bytes):
    """Local-encoding byte string that remembers its UTF-8 original.

    The instance's bytes value is the local representation ``l``; the
    UTF-8 bytes ``u`` are kept in ``_utf8`` so an unmodified string can be
    round-tripped to the local encoding and back."""

    def __new__(cls, u, l):
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
Matt Mackall
|
r13046 | |||
Yuya Nishihara
|
class safelocalstr(bytes):
    """Marker type for a local string that originated as internal UTF-8
    and can be converted back to UTF-8 without loss.

    It adds nothing to bytes, so it compares and hashes exactly like the
    plain byte string it wraps:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Decoding is attempted strictly as UTF-8, then with the fallback
    encoding (Latin-1); if both fail, the string is decoded as UTF-8
    with unknown characters replaced.

    When the local representation is lossy, a localstr is returned so the
    known UTF-8 form stays attached to it and fromlocal() can restore it
    exactly. When it is lossless, a safelocalstr is returned.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        localenc = _sysstr(encoding)
        try:
            # make sure string is actually stored in UTF-8
            decoded = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            encoded = decoded.encode(localenc, "replace")
            if decoded != encoded.decode(localenc):
                # lossy: keep the UTF-8 original next to the local bytes
                return localstr(s, encoded)
            # encoded is a safe, non-lossy encoding of s
            return safelocalstr(encoded)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                decoded = s.decode(_sysstr(fallbackencoding))
                encoded = decoded.encode(localenc, "replace")
                if decoded != encoded.decode(localenc):
                    return localstr(decoded.encode('UTF-8'), encoded)
                # encoded is a safe, non-lossy encoding of s
                return safelocalstr(encoded)
            except UnicodeDecodeError:
                # last ditch; can't round-trip
                decoded = s.decode("utf-8", "replace")
                return decoded.encode(localenc, "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    A localstr is converted by returning its cached UTF-8 form, and a
    pure-ASCII string is returned unchanged.

    Raises error.Abort if s cannot be decoded in the local encoding, or
    if the configured encoding name is unknown to Python.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes on either side of the offending position
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # pass the message as bytes, as tolocal() does, rather than the
        # exception object itself
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def unitolocal(u):
    # type: (Text) -> bytes
    """Encode a unicode string as UTF-8 and convert it to the local
    encoding with tolocal()"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a local-encoding byte string to UTF-8 with fromlocal() and
    decode the result to a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap bytesfunc (e.g. a __bytes__() implementation) in a method
    suitable for __unicode__() or Python 3 __str__(): the bytes result is
    converted with unifromlocal()"""

    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)

    return unifunc
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if not pycompat.ispy3:
    # the native str is already a byte string here, so nothing to convert

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
else:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
Yuya Nishihara
|
r31448 | |||
Yuya Nishihara
|
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
Yuya Nishihara
|
r30034 | |||
Matt Harbison
|
if not pycompat.ispy3:
    getcwd = os.getcwd  # re-exports
elif pycompat.iswindows:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    #
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``.  Since those
    # strings are compared in various places as simple strings, also call
    # realpath here.  See https://bugs.python.org/issue40368
    getcwd = lambda: strtolocal(os.path.realpath(os.getcwd()))  # re-exports
else:
    # Python 3 elsewhere: use the bytes API directly
    getcwd = os.getcwdb  # re-exports
Matt Mackall
|
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide lists the east-asian-width classes that ucolwidth() counts as two
# columns; "A" (ambiguous) is included only on request.
if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide":
    _wide = _sysstr(b"WFA")
else:
    _wide = _sysstr(b"WF")
Matt Mackall
|
r12866 | |||
Matt Mackall
|
def colwidth(s):
    # type: (bytes) -> int
    """Return the number of display columns taken by local string s

    s is decoded with the local encoding (undecodable bytes are replaced)
    and measured with ucolwidth()."""
    text = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(text)
FUJIWARA Katsunori
|
r15066 | |||
Augie Fackler
|
r43346 | |||
FUJIWARA Katsunori
|
def ucolwidth(d):
    # type: (Text) -> int
    """Return the number of display columns taken by Unicode string d

    Characters whose east-asian-width class is listed in _wide count as
    two columns, all others as one. If unicodedata lacks
    east_asian_width, every character counts as one column."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the shortest slice s[start:x] whose column width is exactly c.
    Raises ValueError if no such slice exists.
    """
    # Candidate end offsets start at start + c and must include len(s)
    # itself, otherwise a substring that runs to the very end of s is
    # never tried.
    for x in pycompat.xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')
Matt Mackall
|
r15143 | |||
Augie Fackler
|
r43346 | |||
Augie Fackler
|
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    A string that cannot be decoded in the local encoding is trimmed
    byte by byte instead of column by column.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        text = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to counting bytes
        if len(s) <= width:  # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0:  # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(text) <= width:  # trimming is not needed
        return s
    room = width - len(ellipsis)
    if room <= 0:  # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on each pass until the
    # remainder fits in the columns left over after the ellipsis
    for dropped in pycompat.xrange(1, len(text)):
        if leftside:
            kept = text[dropped:]
        else:
            kept = text[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis  # no enough room for multi-column characters
FUJIWARA Katsunori
|
r21856 | |||
Matt Mackall
|
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s

    asciilower() is tried first. If it raises UnicodeDecodeError, s is
    decoded (from its cached UTF-8 form for a localstr, otherwise from
    the local encoding), lowercased as Unicode and re-encoded in the
    local encoding. s itself is returned when lowercasing changes
    nothing, so that a localstr is preserved. If that conversion fails
    with a UnicodeError, bytes.lower() is used instead.

    Raises error.Abort if the configured encoding name is unknown to
    Python.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # pass the message as bytes, as tolocal() does, rather than the
        # exception object itself
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
FUJIWARA Katsunori
|
r15672 | |||
Augie Fackler
|
r43346 | |||
FUJIWARA Katsunori
|
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware upper-casing of local string s

    asciiupper() is tried first; upperfallback() takes over when it
    raises UnicodeDecodeError."""
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        folded = upperfallback(s)
    return folded
Augie Fackler
|
r43346 | |||
Siddharth Agarwal
|
def upperfallback(s):
    # type: (Any) -> Any
    """upper-case local string s by way of Unicode

    s is decoded (from its cached UTF-8 form for a localstr, otherwise
    from the local encoding), uppercased and re-encoded in the local
    encoding. s itself is returned when uppercasing changes nothing, so
    that a localstr is preserved. If the conversion fails with a
    UnicodeError, bytes.upper() is used instead.

    Raises error.Abort if the configured encoding name is unknown to
    Python.
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # pass the message as bytes, as tolocal() does, rather than the
        # exception object itself
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
|
r16133 | |||
Augie Fackler
|
r43346 | |||
Siddharth Agarwal
|
class normcasespecs(object):
    """Constants describing what a platform's normcase does to ASCII strings

    Each platform picks one of these, and the choice should match what
    normcase on that platform really does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    other = 0
    lower = -1
    upper = 1
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON cannot carry non-Unicode bytes, so the input is normalized
    first:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    utf8b = toutf8b(s)
    try:
        # the fast escaper signals input it cannot handle with ValueError
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
Matt Mackall
|
r22426 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Indexed by the high nibble of a byte: 0 marks an ASCII byte, any other
# value is the number of bytes getutf8char() tries to decode from there.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """Return the complete UTF-8 character of s that begins at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # the first nibble tells how many bytes to attempt decoding
    lead = s[pos : pos + 1]
    nbytes = _utf8len[ord(lead) >> 4]
    if nbytes == 0:  # ascii
        return lead

    char = s[pos : pos + nbytes]
    # validate with attempted decode
    char.decode("utf-8", _utf8strict)
    return char
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # without an 0xed byte there can be no existing U+DCxx character,
        # so valid UTF-8 passes through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            char = getutf8char(s, pos)
            # existing U+DCxx characters have to be re-escaped
            passthrough = not (b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf")
        except UnicodeDecodeError:
            passthrough = False
        if passthrough:
            pos += len(char)
        else:
            # map the single byte at pos into the U+DCxx range
            char = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        pieces.append(char)
    return b"".join(pieces)
Matt Mackall
|
r16133 | |||
Augie Fackler
|
r43346 | |||
Matt Mackall
|
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b() and returns the original binary string. It is
    a round-trip process for strings like filenames, but metadata that
    was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    # fast paths - an escaped U+DCxx character always contains 0xed, so
    # pure ASCII or a string without that byte has nothing to unescape
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        char = getutf8char(s, pos)
        pos += len(char)
        if b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf":
            # unescape U+DCxx characters back to the single original byte
            codepoint = ord(char.decode("utf-8", _utf8strict))
            char = pycompat.bytechr(codepoint & 0xFF)
        pieces.append(char)
    return b"".join(pieces)