encoding.py
721 lines
| 22.5 KiB
| text/x-python
|
PythonLexer
/ mercurial / encoding.py
Martin Geisler
|
r8226 | # encoding.py - character transcoding support for Mercurial | ||
# | ||||
Raphaël Gomès
|
r47575 | # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others | ||
Martin Geisler
|
r8226 | # | ||
# This software may be used and distributed according to the terms of the | ||||
Matt Mackall
|
r10263 | # GNU General Public License version 2 or any later version. | ||
Matt Mackall
|
r7948 | |||
Matt Harbison
|
r52756 | from __future__ import annotations | ||
Gregory Szorc
|
r27355 | |||
import locale | ||||
import os | ||||
r48421 | import re | |||
r52178 | import typing | |||
Gregory Szorc
|
r27355 | import unicodedata | ||
r52178 | from typing import ( | |||
Any, | ||||
Callable, | ||||
Text, | ||||
TypeVar, | ||||
) | ||||
Gregory Szorc
|
r27355 | from . import ( | ||
error, | ||||
Yuya Nishihara
|
r32372 | policy, | ||
Yuya Nishihara
|
r30030 | pycompat, | ||
Gregory Szorc
|
r27355 | ) | ||
Matt Mackall
|
r7948 | |||
Matt Harbison
|
r52833 | from .interfaces import modules as intmod | ||
Augie Fackler
|
r43346 | from .pure import charencode as charencodepure | ||
Yuya Nishihara
|
r33925 | |||
# Type variable bound to the ``localstr`` class defined later in this module.
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# Character-encoding helper module, loaded through ``policy.importmod``.
charencode: intmod.CharEncoding = policy.importmod('charencode')

# Module-level shortcuts to the helpers of the loaded implementation.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

# Alias kept so existing ``unichr`` call sites keep working.
unichr = chr
Gregory Szorc
|
r28507 | |||
Matt Harbison
|
r52615 | |||
Augie Fackler
|
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    chr(int(codepoint, 16)).encode("utf-8")
    for codepoint in (
        b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
        b"206a 206b 206c 206d 206e 206f feff"
    ).split()
]
# verify the next function will work: its fast path only looks for these
# two lead bytes before attempting any replacement
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
Augie Fackler
|
r23596 | |||
Augie Fackler
|
r43346 | |||
def hfsignoreclean(s: bytes) -> bytes:
    """Return s with every codepoint that HFS+ ignores stripped out.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every entry of _ignore starts with 0xe2 or 0xef, so a string holding
    # neither byte cannot contain an ignored sequence.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, b'')
    return s
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            """Look ``key`` up through ``_decc.getenv`` instead of the mapping.

            Returns ``default`` when the variable is unset; the value is
            latin-1 encoded when ``key`` was given as bytes.
            """
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            value = _decc.getenv(key, None)
            if value is None:
                return default
            if isinstance(key, bytes):
                return value.encode('latin-1')
            return value

        # patch the lookup onto the class of the os.environb object
        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        name.encode('utf-8'): value.encode('utf-8')
        for name, value in os.environ.items()  # re-exports
    }
Yuya Nishihara
|
r30034 | |||
Martin von Zweigbergk
|
# Maps encoding names (looked up when the locale supplies the encoding) to
# the name that should be used instead.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'
Matt Mackall
|
r7948 | |||
Matt Harbison
|
encoding: bytes = b''  # help pytype avoid seeing None value
try:
    # An explicit HGENCODING wins; otherwise ask the locale and apply the
    # rewrite table to whatever name it reports.
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'

# Error-handling mode used when decoding local strings.
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
# Encoding tried by tolocal() when stored data is not valid UTF-8.
fallbackencoding = b'ISO-8859-1'
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
class localstr(bytes):
    """Local-encoding byte string that remembers its original UTF-8 form.

    The bytes value is the local representation ``l``; the UTF-8 source
    ``u`` is kept in ``_utf8`` so an unmodified string can be round-tripped
    to the local encoding and back.
    """

    def __new__(cls, u, l):
        obj = super().__new__(cls, l)
        obj._utf8 = u
        return obj

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
Matt Mackall
|
r13046 | |||
Yuya Nishihara
|
class safelocalstr(bytes):
    """Marker type for a local string that came from internal UTF-8 and can
    be converted back to UTF-8 without loss.

    It adds nothing to ``bytes`` and compares and hashes like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
Augie Fackler
|
r43346 | |||
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            text = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            local = text.encode(_sysstr(encoding), "replace")
            if text != local.decode(_sysstr(encoding)):
                # lossy: keep the UTF-8 original next to the local form
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return safelocalstr(local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), "replace")
                if text != local.decode(_sysstr(encoding)):
                    return localstr(text.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            except UnicodeDecodeError:
                text = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return text.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return text.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes on either side of the failing offset
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        sub = s[lo:hi]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
|
r7948 | |||
Augie Fackler
|
r43346 | |||
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
Augie Fackler
|
r43346 | |||
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
Augie Fackler
|
r43346 | |||
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    The returned function calls ``bytesfunc`` on its argument and converts
    the local-encoding result to ``str`` with unifromlocal().
    """

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# Converter functions between native str and byte string. Use these when the
# character encoding of the text is not known (e.g. an exception message) or
# is known to be locale dependent (e.g. date formatting).
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
Yuya Nishihara
|
r31448 | |||
Raphaël Gomès
|
r48359 | |||
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII; fall through to the encoding-aware path
        pass
    try:
        text = (
            s._utf8.decode("utf-8")
            if isinstance(s, localstr)
            else s.decode(_sysstr(encoding), _sysstr(encodingmode))
        )
        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s  # preserve localstring
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Raphaël Gomès
|
r48359 | |||
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII: defer to the encoding-aware implementation
        result = upperfallback(s)
    return result
def upperfallback(s: Any) -> Any:
    """Uppercase local string s by decoding it, for input that the ASCII
    fast path in upper() could not handle."""
    try:
        text = (
            s._utf8.decode("utf-8")
            if isinstance(s, localstr)
            else s.decode(_sysstr(encoding), _sysstr(encodingmode))
        )
        folded = text.upper()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s  # preserve localstring
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Raphaël Gomès
|
r48359 | |||
Yuya Nishihara
|
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # look the key up in its upper-cased form
                folded = upper(key)
                return super().get(folded, default)

        environ = WindowsEnviron()

    for name, value in os.environ.items():  # re-exports
        environ[tolocal(name.encode('utf-8'))] = tolocal(value.encode('utf-8'))
Yuya Nishihara
|
r30034 | |||
# Matches a lower-case drive letter prefix such as b'c:'.
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        """Return the real path of the current directory as local bytes,
        with the drive letter upper-cased."""
        cwd = strtolocal(os.path.realpath(os.getcwd()))  # re-exports
        if not DRIVE_RE.match(cwd):
            return cwd
        return cwd[0:1].upper() + cwd[1:]

else:
    getcwd = os.getcwdb  # re-exports
Matt Harbison
|
r39843 | |||
Matt Mackall
|
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() classes counted as two columns.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
Matt Mackall
|
r12866 | |||
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    # undecodable bytes are replaced rather than raising
    text = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(text)
FUJIWARA Katsunori
|
r15066 | |||
Augie Fackler
|
r43346 | |||
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width information available: one column per character
        return len(d)
    # characters whose east-asian width class is in _wide take two columns
    return sum([2 if eaw(ch) in _wide else 1 for ch in d])
Augie Fackler
|
r43346 | |||
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the shortest slice ``s[start:x]`` (with ``x >= start + c``)
    whose colwidth() is exactly ``c``. Raises ValueError when no such
    slice exists.
    """
    # The end offset must be allowed to reach len(s); stopping at
    # len(s) - 1 would miss a substring that runs to the end of s.
    for x in range(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')
Matt Mackall
|
r15143 | |||
Augie Fackler
|
r43346 | |||
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    If 's' cannot be decoded in the local encoding, it is trimmed by bytes
    instead of by display columns. When 'width' leaves no room for text,
    only (a prefix of) 'ellipsis' is returned; a negative 'width' yields
    an empty string.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to counting bytes as columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            # width + len(ellipsis) is the caller's width; clamp it so a
            # negative value does not become a from-the-end slice bound
            return ellipsis[: max(0, width + len(ellipsis))]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        # same clamp as in the byte-wise branch above
        return ellipsis[: max(0, width + len(ellipsis))]

    # Walk the characters from the side that is kept, accumulating display
    # width until the budget is exceeded. The total width is known to
    # exceed the budget here, so the loop always breaks and ``i`` is the
    # number of characters that fit.
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
Augie Fackler
|
r43346 | |||
FUJIWARA Katsunori
|
r21856 | |||
Gregory Szorc
|
class normcasespecs:
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    # numeric values of the three specs
    other = 0
    lower = -1
    upper = 1
Augie Fackler
|
r43346 | |||
Matt Harbison
|
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the fast escaper rejected the input; use the pure fallback
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
Matt Mackall
|
r22426 | |||
Augie Fackler
|
r43346 | |||
Yuya Nishihara
|
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = 'surrogatepass'

# Indexed by the high nibble of a byte: how many bytes getutf8char() should
# try to decode starting there; 0 means a single ASCII byte.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
Augie Fackler
|
r43346 | |||
def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # find how many bytes to attempt decoding from first nibble
    lead = s[pos : pos + 1]
    nbytes = _utf8len[ord(lead) >> 4]
    if not nbytes:  # ascii
        return lead

    char = s[pos : pos + nbytes]
    # validate with attempted decode; the decoded value itself is not needed
    char.decode("utf-8", _utf8strict)
    return char
Augie Fackler
|
r43346 | |||
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no byte that could start an encoded U+DCxx: valid UTF-8 passes as-is
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)

    def escaped(index):
        # map the single byte at index to U+DC00 + byte value, UTF-8 encoded
        return unichr(0xDC00 + ord(s[index])).encode('utf-8', _utf8strict)

    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            c = None
        if c is None or b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # invalid byte, or an existing U+DCxx character that has to be
            # re-escaped: consume exactly one byte
            c = escaped(pos)
            pos += 1
        else:
            pos += len(c)
        out += c
    return bytes(out)
Matt Mackall
|
r16133 | |||
Augie Fackler
|
r43346 | |||
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    Returns the original binary string. This is a round-trip process for
    strings like filenames, but metadata that was passed through tolocal
    will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    # fast paths: pure ASCII, or no uDxxx prefix anywhere in s
    if isasciistr(s) or b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.
    s = pycompat.bytestr(s)
    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            codepoint = ord(c.decode("utf-8", _utf8strict))
            c = pycompat.bytechr(codepoint & 0xFF)
        out += c
    return bytes(out)