##// END OF EJS Templates
typing: constrain argument/return types of encoding.toutf8b()
typing: constrain argument/return types of encoding.toutf8b()

File last commit:

r44077:83a349aa default
r44077:83a349aa default
Show More
encoding.py
696 lines | 21.5 KiB | text/x-python | PythonLexer
Martin Geisler
put license and copyright info into comment blocks
r8226 # encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
Matt Mackall
Update license to GPLv2+
r10263 # GNU General Public License version 2 or any later version.
Matt Mackall
move encoding bits from util to encoding...
r7948
Yuya Nishihara
doctest: use print_function and convert bytes to unicode where needed
r34139 from __future__ import absolute_import, print_function
Gregory Szorc
encoding: use absolute_import
r27355
import locale
import os
import unicodedata
Gregory Szorc
py3: manually import getattr where it is needed...
r43359 from .pycompat import getattr
Gregory Szorc
encoding: use absolute_import
r27355 from . import (
error,
Yuya Nishihara
parsers: switch to policy importer...
r32372 policy,
Yuya Nishihara
pycompat: provide 'ispy3' constant...
r30030 pycompat,
Gregory Szorc
encoding: use absolute_import
r27355 )
Matt Mackall
move encoding bits from util to encoding...
r7948
Augie Fackler
formatting: blacken the codebase...
r43346 from .pure import charencode as charencodepure
Yuya Nishihara
encoding: extract stub for fast JSON escape...
r33925
Augie Fackler
encoding: add comment-based type hints for pytype...
r43802 if not globals(): # hide this from non-pytype users
from typing import (
Any,
Callable,
List,
Text,
Type,
TypeVar,
Union,
)
# keep pyflakes happy
for t in (Any, Callable, List, Text, Type, Union):
assert t
Yuya Nishihara
typing: fix forward reference in _Tlocalstr type bound...
r44075 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
Augie Fackler
encoding: add comment-based type hints for pytype...
r43802
Augie Fackler
cleanup: remove pointless r-prefixes on single-quoted strings...
# Load the 'charencode' implementation chosen by the module policy rather
# than importing a concrete module directly.
charencode = policy.importmod('charencode')

# Module-level shortcuts to helpers of the selected implementation.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# Used below to turn the byte-string encoding name/mode into the form the
# str.encode()/bytes.decode() calls are given.
_sysstr = pycompat.sysstr
Yuya Nishihara
pycompat: provide 'ispy3' constant...
if pycompat.ispy3:
    # chr() is used in place of unichr() on Python 3
    unichr = chr

# UTF-8 encodings of the unicode characters HFS+ ignores (Apple Technote
# 1150, "Unicode Subtleties"); we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in (
        b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
        b"206a 206b 206c 206d 206e 206f feff"
    ).split()
]
# hfsignoreclean() only scans strings that contain one of these two bytes,
# so every ignored sequence has to start with one of them
assert all(seq.startswith((b"\xe2", b"\xef")) for seq in _ignore)
Augie Fackler
encoding: add hfsignoreclean to clean out HFS-ignored characters...
r23596
Augie Fackler
formatting: blacken the codebase...
r43346
Augie Fackler
encoding: add hfsignoreclean to clean out HFS-ignored characters...
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Return s with every byte sequence listed in _ignore removed.

    _ignore holds the UTF-8 encodings of the codepoints HFS+ ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignored sequences start with \xe2 or \xef (asserted at import
    # time), so a string containing neither byte needs no work.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, b'')
    return s
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034 # encoding.environ is provided read-only, which may not be used to modify
# the process environment
Augie Fackler
formatting: blacken the codebase...
# True when the os module can hand us a bytes environment directly: always
# on Python 2, and on Python 3 only if os.supports_bytes_environ is set.
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if _nativeenviron:
    if pycompat.ispy3:
        environ = os.environb  # re-exports
    else:
        environ = os.environ  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034
Martin von Zweigbergk
encoding: remove unnecessary lambdas from _encodingfixers...
# Encoding names that are replaced by another name before use
# (applied to the detected locale encoding below).
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites.update({b'cp65001': b'utf-8'})
Matt Mackall
move encoding bits from util to encoding...
r7948
# Pick the local encoding: HGENCODING wins; otherwise ask the locale module
# and normalize the answer through _encodingrewrites. Any locale failure
# falls back to plain ASCII.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'

# error handler name passed to bytes.decode() in fromlocal()
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# second encoding tolocal() tries when input is not valid UTF-8
fallbackencoding = b'ISO-8859-1'
Matt Mackall
move encoding bits from util to encoding...
r7948
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
py3: change encoding.localstr to a subclass of bytes, not str
class localstr(bytes):
    '''Byte string in the local encoding that remembers its UTF-8 form.

    The instance compares like the local-encoding bytes it was built from,
    while the original UTF-8 bytes are kept in the ``_utf8`` attribute so
    an unmodified string can be round-tripped to the local encoding and
    back without loss.
    '''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # 'u' is the UTF-8 encoded form (bytes, not unicode) and 'l' is the
        # local-encoding form that becomes the value of the instance.
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # Hash on the UTF-8 form: two different UTF-8 strings may share one
        # lossy local representation, and they must not collide as dict
        # keys (avoid collisions in local string space).
        return hash(self._utf8)
Matt Mackall
encoding: add localstr class to track UTF-8 version of transcoded strings...
r13046
Yuya Nishihara
encoding: introduce tagging type for non-lossy non-ASCII string...
class safelocalstr(bytes):
    """Marker type for a local string that came from internal UTF-8 and
    can be converted back to UTF-8 losslessly

    It adds nothing to bytes; equality and hashing are those of bytes.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
move encoding bits from util to encoding...
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    The result is 's' itself for pure-ASCII input or when the local
    encoding is exactly b'UTF-8', a safelocalstr when the conversion is
    lossless, a localstr when it is lossy but the UTF-8 form is known,
    and plain bytes in the last-ditch case.

    An unknown local encoding name is reported as error.Abort.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: keep the original UTF-8 bytes next to the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # Build the message as a byte string, like the other Abort
        # messages in this module, instead of passing the exception object.
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
move encoding bits from util to encoding...
r7948
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
move encoding bits from util to encoding...
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    A localstr is converted by returning the UTF-8 bytes it carries, and
    pure-ASCII input is returned unchanged. Undecodable input and an
    unknown local encoding name are both reported as error.Abort.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes on either side of the failing position
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # Same treatment as the decode error above: give Abort a byte
        # string rather than the exception object.
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
Matt Mackall
move encoding bits from util to encoding...
r7948
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
encoding: factor out unicode variants of from/tolocal()...
def unitolocal(u):
    # type: (Text) -> bytes
    """Encode unicode string u as UTF-8 and pass it through tolocal()"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
encoding: factor out unicode variants of from/tolocal()...
def unifromlocal(s):
    # type: (bytes) -> Text
    """Pass local byte string s through fromlocal() and decode the UTF-8
    result into a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
py3: add utility to forward __str__() to __bytes__()...
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap a bytes-returning method so that it returns unicode

    Meant for forwarding __unicode__() and __str__() of Python 3 to
    __bytes__(): the wrapper calls bytesfunc on the object and converts
    the result with unifromlocal()."""

    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)

    return unifunc
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
encoding: add converter between native str and byte string...
r31448 # converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if not pycompat.ispy3:
    # On Python 2 the values are passed through unchanged.

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
else:
    # On Python 3 native str is unicode: reuse the unicode converters.
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
Yuya Nishihara
encoding: add converter between native str and byte string...
r31448
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = {
        tolocal(k.encode('utf-8')): tolocal(v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    }
Yuya Nishihara
py3: provide encoding.environ which is a dict of bytes...
r30034
Matt Harbison
py3: rename pycompat.getcwd() to encoding.getcwd() (API)...
if not pycompat.ispy3:
    getcwd = os.getcwd  # re-exports
elif pycompat.iswindows:
    # os.getcwd() on Python 3 returns string. os.getcwdb() would return
    # bytes, but Python 3 on Windows issues a DeprecationWarning about
    # using the bytes API when it is called, so convert the string instead.
    getcwd = lambda: strtolocal(os.getcwd())  # re-exports
else:
    # os.getcwdb() returns bytes directly
    getcwd = os.getcwdb  # re-exports
Matt Mackall
encoding: default ambiguous character to narrow...
r12866 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
Augie Fackler
formatting: blacken the codebase...
# East Asian Width classes that ucolwidth() counts as two columns; class 'A'
# is included only when HGENCODINGAMBIGUOUS is set to 'wide'.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
Matt Mackall
encoding: default ambiguous character to narrow...
r12866
Matt Mackall
move encoding bits from util to encoding...
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding

    's' is decoded with the local encoding using the 'replace' error
    handler, so undecodable bytes are measured as replacement characters
    instead of raising, and the result is measured by ucolwidth().
    """
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
FUJIWARA Katsunori
i18n: calculate terminal columns by width information of each characters...
r15066
Augie Fackler
formatting: blacken the codebase...
r43346
FUJIWARA Katsunori
i18n: calculate terminal columns by width information of each characters...
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display

    Each character whose East Asian Width class is listed in _wide counts
    as two columns and every other character as one. If unicodedata has no
    east_asian_width function, the plain length of 'd' is returned.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
encoding: add getcols to extract substrings based on column width
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Return the slice of s beginning at byte index start whose
    colwidth() is exactly c columns

    Raises ValueError if no candidate slice has that width.'''
    # NOTE(review): the end index stops at len(s) - 1, so a slice reaching
    # the very end of s is never tried -- confirm this is intended.
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
Matt Mackall
encoding: add getcols to extract substrings based on column width
r15143
Augie Fackler
formatting: blacken the codebase...
r43346
Augie Fackler
formatting: byteify all mercurial/ and hgext/ string literals...
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Cut string 's' down to at most 'width' columns, 'ellipsis' included.

    The right end of 's' is cut off unless 'leftside' is True, in which
    case the left end is. 'ellipsis' is always attached at the cut side.
    If 's' cannot be decoded with the local encoding, it is trimmed by
    byte count instead of by column width.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # Not decodable in the local encoding: work on raw bytes, counting
        # one column per byte.
        if len(s) <= width:
            # already short enough
            return s
        width -= len(ellipsis)
        if width <= 0:
            # not even the ellipsis fits; return as much of it as does
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:
        # already short enough
        return s

    width -= len(ellipsis)
    if width <= 0:
        # not even the ellipsis fits; return as much of it as does
        return ellipsis[: width + len(ellipsis)]

    # Drop one more character from the trimmed side on each round until
    # what is left fits into the remaining columns.
    for dropped in pycompat.xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= width:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    # not enough room for even one multi-column character
    return ellipsis
FUJIWARA Katsunori
encoding: add 'trim' to trim multi-byte characters at most specified columns...
r21856
Matt Mackall
encoding: add an encoding-aware lower function
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s

    asciilower() is tried first. If it raises UnicodeDecodeError, the
    string is decoded (from the cached UTF-8 form of a localstr, otherwise
    from the local encoding), lower-cased as unicode and re-encoded in the
    local encoding.

    The object passed in is returned as-is when lower-casing changes
    nothing, so that a localstr is preserved. If the string cannot be
    transcoded, the plain bytes ``lower()`` result is returned instead.

    Raises error.Abort when a LookupError occurs during transcoding.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
FUJIWARA Katsunori
i18n: use encoding.lower/upper for encoding aware case folding...
r15672
Augie Fackler
formatting: blacken the codebase...
r43346
FUJIWARA Katsunori
i18n: use encoding.lower/upper for encoding aware case folding...
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware upper-casing of local string s

    asciiupper() is tried first; if it raises UnicodeDecodeError the work
    is delegated to upperfallback().
    """
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
Augie Fackler
formatting: blacken the codebase...
r43346
Siddharth Agarwal
encoding.upper: factor out fallback code...
def upperfallback(s):
    # type: (Any) -> Any
    """encoding-aware upper-casing used when the ASCII fast path fails

    The string is decoded (from the cached UTF-8 form of a localstr,
    otherwise from the local encoding), upper-cased as unicode and
    re-encoded in the local encoding.

    The object passed in is returned as-is when upper-casing changes
    nothing, so that a localstr is preserved. If the string cannot be
    transcoded, the plain ``upper()`` result is returned instead.

    Raises error.Abort when a LookupError occurs during transcoding.
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
Matt Mackall
encoding: introduce utf8-b helpers
r16133
Augie Fackler
formatting: blacken the codebase...
r43346
Siddharth Agarwal
encoding: define an enum that specifies what normcase does to ASCII strings...
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
encoding: add option to escape non-ascii characters in JSON...
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''
    # normalize to UTF-8b first so that every input is a UTF-8 byte string
    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the fast implementation declined this input; use the pure
        # Python fallback below
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
Matt Mackall
encoding: add json escaping filter...
r22426
Augie Fackler
formatting: blacken the codebase...
r43346
Yuya Nishihara
py3: use 'surrogatepass' error handler to process U+DCxx transparently...
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# Number of bytes to try decoding as one UTF-8 character, indexed by the
# high nibble of the first byte. 0 marks a single-byte ASCII character
# (0x00-0x7f). Nibbles 0x8-0xb (bytes 0x80-0xbf) map to 1, so such a byte
# is decoded on its own instead of taking the ASCII shortcut.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
encoding: add getutf8char helper...
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Returns the bytes making up that character (one byte for ASCII).

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # find how many bytes to attempt decoding from first nibble
    # (s is sliced rather than indexed so a length-1 bytes object is
    # obtained on both Python 2 and Python 3)
    first = s[pos : pos + 1]
    nbytes = _utf8len[ord(first) >> 4]
    if not nbytes:  # ascii
        return first

    c = s[pos : pos + nbytes]
    # validate with attempted decode; the decoded value itself is not needed
    c.decode("utf-8", _utf8strict)
    return c
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
encoding: introduce utf8-b helpers
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # without a "\xed" byte there can be no pre-existing U+DCxx
        # sequence, so valid UTF-8 can be passed through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    # collect the pieces and join once instead of repeatedly concatenating
    # bytes, which copies the accumulated result on every iteration
    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            # have to re-escape existing U+DCxx characters
            escape = b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf"
        except UnicodeDecodeError:
            # not the start of a valid UTF-8 character: escape this byte
            escape = True
        if escape:
            # map the single byte at pos to U+DC00 + byte value and
            # advance by one byte only
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        else:
            pos += len(c)
        chunks.append(c)
    return b"".join(chunks)
Matt Mackall
encoding: introduce utf8-b helpers
r16133
Augie Fackler
formatting: blacken the codebase...
r43346
Matt Mackall
encoding: introduce utf8-b helpers
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This returns the original binary string. It is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # collect the pieces and join once instead of repeatedly concatenating
    # bytes, which copies the accumulated result on every iteration
    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # the low 8 bits of the code point are the original byte
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        chunks.append(c)
    return b"".join(chunks)