upstream/mercurial-mirror Files · mercurial/encoding.py

discovery: add a `devel', b'discovery.grow-sample`...

discovery: add a `devel.discovery.grow-sample` option. That option makes it possible to disable the "sample growing" behavior when doing analysis and comparison. Differential Revision: https://phab.mercurial-scm.org/D9798

Augie Fackler - - Load All Authors

File last commit:

r46554:89a2afe3 default


                r47017:397e39ad

default

Download file

             encoding.py
        
                    704 lines
            
             | 21.7 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / encoding.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Martin Geisler
    
put license and copyright info into comment blocks

              r8226
            
      # encoding.py - character transcoding support for Mercurial

      #

      #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others

      #

      # This software may be used and distributed according to the terms of the

        Matt Mackall
    
Update license to GPLv2+

              r10263
            
      # GNU General Public License version 2 or any later version.

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Yuya Nishihara
    
doctest: use print_function and convert bytes to unicode where needed

              r34139
            
      from __future__ import absolute_import, print_function

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      import locale

      import os

      import unicodedata

        Gregory Szorc
    
py3: manually import getattr where it is needed...

              r43359
            
      from .pycompat import getattr

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      from . import (

          error,

        Yuya Nishihara
    
parsers: switch to policy importer...

              r32372
            
          policy,

        Yuya Nishihara
    
pycompat: provide 'ispy3' constant...

              r30030
            
          pycompat,

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      from .pure import charencode as charencodepure

        Yuya Nishihara
    
encoding: extract stub for fast JSON escape...

              r33925
            
        Yuya Nishihara
    
typing: consolidate "if not globals():" trick...

              r44212
            
      if pycompat.TYPE_CHECKING:

        Augie Fackler
    
encoding: add comment-based type hints for pytype...

              r43802
            
          from typing import (

              Any,

              Callable,

              List,

              Text,

              Type,

              TypeVar,

              Union,

          )

          # keep pyflakes happy

          for t in (Any, Callable, List, Text, Type, Union):

              assert t

        Yuya Nishihara
    
typing: fix forward reference in _Tlocalstr type bound...

              r44075
            
          _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

        Augie Fackler
    
encoding: add comment-based type hints for pytype...

              r43802
            
        Augie Fackler
    
cleanup: remove pointless r-prefixes on single-quoted strings...

              r43906
            
      charencode = policy.importmod('charencode')

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
        Yuya Nishihara
    
encoding: add function to test if a str consists of ASCII characters...

              r33927
            
      isasciistr = charencode.isasciistr

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
      asciilower = charencode.asciilower

      asciiupper = charencode.asciiupper

        Yuya Nishihara
    
encoding: add fast path of jsonescape() (issue5533)...

              r33926
            
      _jsonescapeu8fast = charencode.jsonescapeu8fast

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
        Yuya Nishihara
    
py3: convert encoding name and mode to str...

              r30033
            
      _sysstr = pycompat.sysstr

        Yuya Nishihara
    
pycompat: provide 'ispy3' constant...

              r30030
            
      if pycompat.ispy3:

        Gregory Szorc
    
encoding: make HFS+ ignore code Python 3 compatible...

              r28507
            
          unichr = chr

        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      # These unicode characters are ignored by HFS+ (Apple Technote 1150,

      # "Unicode Subtleties"), so we need to ignore them in some places for

      # sanity.

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      _ignore = [

          unichr(int(x, 16)).encode("utf-8")

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "

          b"206a 206b 206c 206d 206e 206f feff".split()

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      ]

        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      # verify the next function will work

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
      assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)

        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      def hfsignoreclean(s):
          # type: (bytes) -> bytes
          """Strip the codepoints that HFS+ ignores out of s.

          Every occurrence of an entry of ``_ignore`` (each one the UTF-8
          encoding of a single ignored codepoint) is removed from ``s`` and
          the result is returned.

          >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
          '.hg'
          >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
          '.hg'
          """
          # Every entry of _ignore starts with 0xe2 or 0xef (asserted right
          # after _ignore is built), so input lacking both bytes cannot
          # contain any of them and is returned untouched.
          if b"\xe2" not in s and b"\xef" not in s:
              return s
          cleaned = s
          for ignored in _ignore:
              cleaned = cleaned.replace(ignored, b'')
          return cleaned

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
      # encoding.environ is provided read-only, which may not be used to modify

      # the process environment

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      # True when the interpreter can hand out the environment as bytes
      # directly: always on Python 2, and on Python 3 only where
      # os.supports_bytes_environ is set.
      _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
      if pycompat.ispy3 and not _nativeenviron:
          # preferred encoding isn't known yet; use utf-8 to avoid unicode error
          # and recreate it once encoding is settled
          environ = {
              name.encode('utf-8'): value.encode('utf-8')
              for name, value in os.environ.items()  # re-exports
          }
      elif pycompat.ispy3:
          environ = os.environb  # re-exports
      else:
          environ = os.environ  # re-exports

        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
        Martin von Zweigbergk
    
encoding: remove unnecessary lambdas from _encodingfixers...

              r39871
            
      _encodingrewrites = {

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          b'646': b'ascii',

          b'ANSI_X3.4-1968': b'ascii',

        Dan Villiom Podlaski Christiansen
    
encoding: improve handling of buggy getpreferredencoding() on Mac OS X...

              r11892
            
      }

        Yuya Nishihara
    
encoding: alias cp65001 to utf-8 on Windows...

              r38633
            
      # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.

      # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.

      # https://bugs.python.org/issue13216

      if pycompat.iswindows and not pycompat.ispy3:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          _encodingrewrites[b'cp65001'] = b'utf-8'

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      try:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          encoding = environ.get(b"HGENCODING")

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
          if not encoding:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
              encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'

        Martin von Zweigbergk
    
encoding: remove unnecessary lambdas from _encodingfixers...

              r39871
            
              encoding = _encodingrewrites.get(encoding, encoding)

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      except locale.Error:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          encoding = b'ascii'

      encodingmode = environ.get(b"HGENCODINGMODE", b"strict")

      fallbackencoding = b'ISO-8859-1'

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: change encoding.localstr to a subclass of bytes, not str

              r33810
            
      class localstr(bytes):
          """Local-encoding byte string that remembers its UTF-8 form.

          The instance's own bytes are the local representation ``l``; the
          UTF-8 bytes ``u`` are stored in ``_utf8`` so that a string which
          has not been modified can be round-tripped to the local encoding
          and back."""

          def __new__(cls, u, l):
              # the byte payload has to be handed to bytes.__new__; the UTF-8
              # form is attached to the freshly created instance afterwards
              obj = bytes.__new__(cls, l)
              obj._utf8 = u
              return obj

          if pycompat.TYPE_CHECKING:
              # pseudo implementation to help pytype see localstr() constructor
              def __init__(self, u, l):
                  # type: (bytes, bytes) -> None
                  super(localstr, self).__init__(l)
                  self._utf8 = u

          def __hash__(self):
              # hash the UTF-8 form rather than the local bytes to avoid
              # collisions in local string space
              return hash(self._utf8)

        Matt Mackall
    
encoding: add localstr class to track UTF-8 version of transcoded strings...

              r13046
            
        Yuya Nishihara
    
encoding: introduce tagging type for non-lossy non-ASCII string...

              r37966
            
      class safelocalstr(bytes):
          """Tagged string denoting it was previously an internal UTF-8 string,
          and can be converted back to UTF-8 losslessly

          The class adds nothing to ``bytes``; instances compare and hash like
          plain byte strings with the same content:

          >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
          >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
          >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
          >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
          """

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def tolocal(s):
          # type: (bytes) -> bytes
          """
          Convert a string from internal UTF-8 to local encoding

          All internal strings should be UTF-8 but some repos before the
          implementation of locale support may contain latin1 or possibly
          other character sets. We attempt to decode everything strictly
          using UTF-8, then Latin-1, and failing that, we use UTF-8 and
          replace unknown characters.

          The result is ``s`` itself for ASCII input or when the local
          encoding is UTF-8, a ``safelocalstr`` when the conversion is
          lossless, and a ``localstr`` (which caches the known UTF-8 encoding
          next to the local representation) when it is lossy, so that the
          string can still be round-tripped back to UTF-8. An unknown
          encoding name is reported as ``error.Abort``.

          >>> u = b'foo: \\xc3\\xa4' # utf-8
          >>> l = tolocal(u)
          >>> l
          'foo: ?'
          >>> fromlocal(l)
          'foo: \\xc3\\xa4'
          >>> u2 = b'foo: \\xc3\\xa1'
          >>> d = { l: 1, tolocal(u2): 2 }
          >>> len(d) # no collision
          2
          >>> b'foo: ?' in d
          False
          >>> l1 = b'foo: \\xe4' # historical latin1 fallback
          >>> l = tolocal(l1)
          >>> l
          'foo: ?'
          >>> fromlocal(l) # magically in utf-8
          'foo: \\xc3\\xa4'
          """

          if isasciistr(s):
              return s

          try:
              try:
                  # make sure string is actually stored in UTF-8
                  decoded = s.decode('UTF-8')
                  if encoding == b'UTF-8':
                      # fast path
                      return s
                  local = decoded.encode(_sysstr(encoding), "replace")
                  if decoded != local.decode(_sysstr(encoding)):
                      # lossy: remember the original UTF-8 bytes
                      return localstr(s, local)
                  # local is a safe, non-lossy encoding of s
                  return safelocalstr(local)
              except UnicodeDecodeError:
                  # we should only get here if we're looking at an ancient changeset
                  try:
                      decoded = s.decode(_sysstr(fallbackencoding))
                      local = decoded.encode(_sysstr(encoding), "replace")
                      if decoded != local.decode(_sysstr(encoding)):
                          # lossy: cache the UTF-8 form of the fallback decode
                          return localstr(decoded.encode('UTF-8'), local)
                      # local is a safe, non-lossy encoding of s
                      return safelocalstr(local)
                  except UnicodeDecodeError:
                      decoded = s.decode("utf-8", "replace")  # last ditch
                      # can't round-trip
                      return decoded.encode(_sysstr(encoding), "replace")
          except LookupError as k:
              raise error.Abort(
                  pycompat.bytestr(k), hint=b"please check your locale settings"
              )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def fromlocal(s):
          # type: (bytes) -> bytes
          """
          Convert a string from the local character encoding to UTF-8

          We attempt to decode strings using the encoding mode set by
          HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
          characters will cause an error message. Other modes include
          'replace', which replaces unknown characters with a special
          Unicode character, and 'ignore', which drops the character.

          A ``localstr`` returns its cached UTF-8 bytes and ASCII input is
          returned unchanged. Raises ``error.Abort`` when decoding fails or
          when the configured encoding name is unknown.
          """

          # can we do a lossless round-trip?
          if isinstance(s, localstr):
              return s._utf8
          if isasciistr(s):
              return s

          try:
              u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
              return u.encode("utf-8")
          except UnicodeDecodeError as inst:
              # quote up to 10 bytes on either side of the offending position
              sub = s[max(0, inst.start - 10) : inst.start + 10]
              raise error.Abort(
                  b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
              )
          except LookupError as k:
              # hand Abort a byte string rather than the exception object,
              # the same way tolocal() reports an unknown encoding
              raise error.Abort(
                  pycompat.bytestr(k), hint=b"please check your locale settings"
              )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: factor out unicode variants of from/tolocal()...

              r31447
            
      def unitolocal(u):
          # type: (Text) -> bytes
          """Encode a unicode string to UTF-8 and convert it with tolocal()"""
          utf8 = u.encode('utf-8')
          return tolocal(utf8)

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: factor out unicode variants of from/tolocal()...

              r31447
            
      def unifromlocal(s):
          # type: (bytes) -> Text
          """Convert local-encoding bytes with fromlocal() and decode as UTF-8"""
          utf8 = fromlocal(s)
          return utf8.decode('utf-8')

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: add utility to forward __str__() to __bytes__()...

              r33022
            
      def unimethod(bytesfunc):
          # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
          """Build a proxy method forwarding __unicode__() and __str__() of
          Python 3 to __bytes__()

          The returned function calls ``bytesfunc`` on its argument and
          converts the resulting local-encoding bytes with unifromlocal()."""

          def unifunc(obj):
              localbytes = bytesfunc(obj)
              return unifromlocal(localbytes)

          return unifunc

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: add converter between native str and byte string...

              r31448
            
      # converter functions between native str and byte string. use these if the

      # character encoding is not aware (e.g. exception message) or is known to

      # be locale dependent (e.g. date formatting.)

      if not pycompat.ispy3:
          # native str is used as the byte string here, so the converters
          # return their argument unchanged

          def strtolocal(s):
              # type: (str) -> bytes
              return s  # pytype: disable=bad-return-type

          def strfromlocal(s):
              # type: (bytes) -> str
              return s  # pytype: disable=bad-return-type

          strmethod = pycompat.identity
      else:
          # native str is unicode: reuse the unicode converters
          strtolocal = unitolocal
          strfromlocal = unifromlocal
          strmethod = unimethod

        Yuya Nishihara
    
encoding: add converter between native str and byte string...

              r31448
            
        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
      if not _nativeenviron:
          # now encoding and helper functions are available, recreate the environ
          # dict to be exported to other modules
          environ = {
              tolocal(name.encode('utf-8')): tolocal(value.encode('utf-8'))
              for name, value in os.environ.items()  # re-exports
          }

        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
        Matt Harbison
    
py3: rename pycompat.getcwd() to encoding.getcwd() (API)...

              r39843
            
      if not pycompat.ispy3:
          getcwd = os.getcwd  # re-exports
      elif pycompat.iswindows:
          # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
          # returns bytes. Python 3 on Windows issues a DeprecationWarning about
          # using the bytes API when os.getcwdb() is called, so convert the
          # string result instead.
          getcwd = lambda: strtolocal(os.getcwd())  # re-exports
      else:
          getcwd = os.getcwdb  # re-exports

        Matt Mackall
    
encoding: default ambiguous character to narrow...

              r12866
            
      # East Asian Width classes that ucolwidth() counts as two columns.
      # Ambiguous-width characters (class 'A') are included only when
      # HGENCODINGAMBIGUOUS is set to 'wide'; the default is 'narrow'.
      _wide = _sysstr(
          b"WFA"
          if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
          else b"WF"
      )

        Matt Mackall
    
encoding: default ambiguous character to narrow...

              r12866
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def colwidth(s):
          # type: (bytes) -> int
          """Return the display column width of bytes s in the local encoding

          s is decoded with the current ``encoding`` using the 'replace' error
          handler, so undecodable bytes do not raise, and the resulting text is
          measured with ucolwidth().
          """
          decoded = s.decode(_sysstr(encoding), 'replace')
          return ucolwidth(decoded)

        FUJIWARA Katsunori
    
i18n: calculate terminal columns by width information of each characters...

              r15066
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        FUJIWARA Katsunori
    
i18n: calculate terminal columns by width information of each characters...

              r15066
            
      def ucolwidth(d):
          # type: (Text) -> int
          """Return the display column width of Unicode string d

          Each character whose East Asian Width class is listed in ``_wide``
          counts as two columns, every other character as one.  If
          unicodedata has no east_asian_width(), the result is simply len(d).
          """
          width_of = getattr(unicodedata, 'east_asian_width', None)
          if width_of is None:
              return len(d)
          columns = 0
          for ch in d:
              columns += 2 if width_of(ch) in _wide else 1
          return columns

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: add getcols to extract substrings based on column width

              r15143
            
      def getcols(s, start, c):
          # type: (bytes, int, int) -> bytes
          """Use colwidth to find a c-column substring of s starting at byte
          index start

          Candidates ``s[start:end]`` are tried with ``end`` growing from
          ``start + c`` up to and including ``len(s)``; the first candidate
          whose colwidth() equals ``c`` is returned.

          Raises ValueError if no candidate has exactly ``c`` columns.
          """
          # The upper bound is len(s) + 1 so that the substring reaching the
          # very end of s is also tried; stopping at len(s) would wrongly
          # report "not found" when the match is the whole tail of s.
          for end in pycompat.xrange(start + c, len(s) + 1):
              candidate = s[start:end]
              if colwidth(candidate) == c:
                  return candidate
          raise ValueError('substring not found')

        Matt Mackall
    
encoding: add getcols to extract substrings based on column width

              r15143
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
      def trim(s, width, ellipsis=b'', leftside=False):
          # type: (bytes, int, bytes, bool) -> bytes
          """Trim string 's' to at most 'width' columns (including 'ellipsis').

          If 'leftside' is True, left side of string 's' is trimmed.
          'ellipsis' is always placed at trimmed side.

          If 's' cannot be decoded with the current encoding, it is trimmed by
          byte count instead of by display columns.

          >>> from .node import bin
          >>> def bprint(s):
          ...     print(pycompat.sysstr(s))
          >>> ellipsis = b'+++'
          >>> from . import encoding
          >>> encoding.encoding = b'utf-8'
          >>> t = b'1234567890'
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          1234567890
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          1234567890
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          12345+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++67890
          >>> bprint(trim(t, 8))
          12345678
          >>> bprint(trim(t, 8, leftside=True))
          34567890
          >>> bprint(trim(t, 3, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 1, ellipsis=ellipsis))
          +
          >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
          >>> t = u.encode(pycompat.sysstr(encoding.encoding))
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 5))
          \xe3\x81\x82\xe3\x81\x84
          >>> bprint(trim(t, 5, leftside=True))
          \xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 4, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
          +++
          >>> t = bin(b'112233445566778899aa') # invalid byte sequence
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 8))
          \x11\x22\x33\x44\x55\x66\x77\x88
          >>> bprint(trim(t, 8, leftside=True))
          \x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 3, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 1, ellipsis=ellipsis))
          +
          """
          try:
              u = s.decode(_sysstr(encoding))
          except UnicodeDecodeError:
              # Undecodable input: measure and cut in bytes.
              if len(s) <= width:  # already short enough
                  return s
              room = width - len(ellipsis)
              if room <= 0:  # the ellipsis alone does not fit; cut it down
                  return ellipsis[:width]
              if leftside:
                  return ellipsis + s[-room:]
              return s[:room] + ellipsis

          if ucolwidth(u) <= width:  # already short enough
              return s

          room = width - len(ellipsis)
          if room <= 0:  # the ellipsis alone does not fit; cut it down
              return ellipsis[:width]

          # Drop one more character from the trimmed side on each pass until
          # what is left fits into the columns remaining beside the ellipsis.
          for dropped in pycompat.xrange(1, len(u)):
              if leftside:
                  kept = u[dropped:]
              else:
                  kept = u[:-dropped]
              if ucolwidth(kept) > room:
                  continue
              encoded = kept.encode(_sysstr(encoding))
              if leftside:
                  return ellipsis + encoded
              return encoded + ellipsis
          return ellipsis  # not enough room for multi-column characters

        FUJIWARA Katsunori
    
encoding: add 'trim' to trim multi-byte characters at most specified columns...

              r21856
            
        Matt Mackall
    
encoding: add an encoding-aware lower function

              r14069
            
      def lower(s):
          # type: (bytes) -> bytes
          """Best-effort, encoding-aware lower-casing of local string s

          asciilower() is tried first.  If it raises UnicodeDecodeError, s is
          decoded (from its stored UTF-8 form when it is a localstr, otherwise
          with the current encoding and encodingmode), lower-cased and encoded
          again.  s itself is returned when lower-casing changes nothing.

          Raises error.Abort if the encoding lookup fails.
          """
          try:
              return asciilower(s)
          except UnicodeDecodeError:
              # the fast path could not handle s; use the slow path below
              pass
          try:
              if isinstance(s, localstr):
                  text = s._utf8.decode("utf-8")
              else:
                  text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

              folded = text.lower()
              if folded != text:
                  return folded.encode(_sysstr(encoding))
              return s  # unchanged: hand back the original to preserve localstr
          except UnicodeError:
              return s.lower()  # we don't know how to fold this except in ASCII
          except LookupError as k:
              # NOTE(review): k is the exception object, not a bytes message --
              # confirm that error.Abort accepts it as its first argument.
              raise error.Abort(k, hint=b"please check your locale settings")

        FUJIWARA Katsunori
    
i18n: use encoding.lower/upper for encoding aware case folding...

              r15672
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        FUJIWARA Katsunori
    
i18n: use encoding.lower/upper for encoding aware case folding...

              r15672
            
      def upper(s):
          # type: (bytes) -> bytes
          """Best-effort, encoding-aware upper-casing of local string s

          asciiupper() is tried first; if it raises UnicodeDecodeError the work
          is handed over to upperfallback().
          """
          try:
              folded = asciiupper(s)
          except UnicodeDecodeError:
              return upperfallback(s)
          else:
              return folded

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Siddharth Agarwal
    
encoding.upper: factor out fallback code...

              r24597
            
      def upperfallback(s):
          # type: (Any) -> Any
          """Upper-case local string s by decoding it to Unicode first

          s is decoded (from its stored UTF-8 form when it is a localstr,
          otherwise with the current encoding and encodingmode), upper-cased
          and encoded again.  s itself is returned when upper-casing changes
          nothing.

          Raises error.Abort if the encoding lookup fails.
          """
          try:
              if isinstance(s, localstr):
                  text = s._utf8.decode("utf-8")
              else:
                  text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

              folded = text.upper()
              if folded != text:
                  return folded.encode(_sysstr(encoding))
              return s  # unchanged: hand back the original to preserve localstr
          except UnicodeError:
              return s.upper()  # we don't know how to fold this except in ASCII
          except LookupError as k:
              # NOTE(review): k is the exception object, not a bytes message --
              # confirm that error.Abort accepts it as its first argument.
              raise error.Abort(k, hint=b"please check your locale settings")

        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Siddharth Agarwal
    
encoding: define an enum that specifies what normcase does to ASCII strings...

              r24593
            
      class normcasespecs(object):
          """what a platform's normcase does to ASCII strings

          Each platform specifies one of these values, and the value should be
          consistent with what normcase on that platform actually does.

          lower: normcase lowercases ASCII strings
          upper: normcase uppercases ASCII strings
          other: the fallback function should always be called

          This should be kept in sync with normcase_spec in util.h."""

          lower = -1
          other = 0
          upper = 1

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: add option to escape non-ascii characters in JSON...

              r28068
            
      def jsonescape(s, paranoid=False):
          # type: (Any, Any) -> Any
          """returns a string suitable for JSON

          JSON is problematic for us because it doesn't support non-Unicode
          bytes. To deal with this, we take the following approach:

          - localstr/safelocalstr objects are converted back to UTF-8
          - valid UTF-8/ASCII strings are passed as-is
          - other strings are converted to UTF-8b surrogate encoding
          - apply JSON-specified string escaping

          (escapes are doubled in these tests)

          >>> jsonescape(b'this is a test')
          'this is a test'
          >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
          'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
          >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
          'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
          >>> jsonescape(b'a weird byte: \\xdd')
          'a weird byte: \\xed\\xb3\\x9d'
          >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
          'utf-8: caf\\xc3\\xa9'
          >>> jsonescape(b'')
          ''

          If paranoid, non-ascii and common troublesome characters are also escaped.
          This is suitable for web output.

          >>> s = b'escape characters: \\0 \\x0b \\x7f'
          >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
          >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
          >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
          >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
          'escape boundary: ~ \\\\u007f \\\\u0080'
          >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
          'a weird byte: \\\\udcdd'
          >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
          'utf-8: caf\\\\u00e9'
          >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
          'non-BMP: \\\\ud834\\\\udd1e'
          >>> jsonescape(b'<foo@example.org>', paranoid=True)
          '\\\\u003cfoo@example.org\\\\u003e'
          """

          encoded = toutf8b(s)
          try:
              escaped = _jsonescapeu8fast(encoded, paranoid)
          except ValueError:
              # the fast escaper rejected this input; use the fallback below
              pass
          else:
              return escaped
          return charencodepure.jsonescapeu8fallback(encoded, paranoid)

        Matt Mackall
    
encoding: add json escaping filter...

              r22426
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: use 'surrogatepass' error handler to process U+DCxx transparently...

              r34215
            
      # Invalid UTF-8 bytes are mapped into the U+DCxx range, so those code
      # points have to pass through decode/encode transparently.
      _utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

      # Number of bytes to try decoding, indexed by the high nibble of the
      # lead byte. 0 marks ASCII; nibbles 0x8-0xB are probed as a single byte.
      _utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: add getutf8char helper...

              r26875
            
      def getutf8char(s, pos):
          # type: (bytes, int) -> bytes
          """Return the full utf-8 character of ``s`` that starts at ``pos``.

          The number of bytes to take is looked up from the high nibble of
          the byte at ``pos``; the resulting slice is then validated by
          attempting to decode it.

          Raises a UnicodeError if the given location does not start a valid
          utf-8 character.
          """
          lead = s[pos : pos + 1]
          # the high nibble of the lead byte selects how many bytes to try
          width = _utf8len[ord(lead) >> 4]
          if width == 0:  # ascii
              return lead

          char = s[pos : pos + width]
          # decode only to validate; the decoded value itself is discarded
          char.decode("utf-8", _utf8strict)
          return char

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
      def toutf8b(s):
          # type: (bytes) -> bytes
          """convert a local, possibly-binary string into UTF-8b

          This is intended as a generic method to preserve data when working
          with schemes like JSON and XML that have no provision for
          arbitrary byte strings. As Mercurial often doesn't know
          what encoding data is in, we use so-called UTF-8b.

          If a string is already valid UTF-8 (or ASCII), it passes unmodified.
          Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
          uDC00-uDCFF.

          Principles of operation:

          - ASCII and UTF-8 data successfully round-trips and is understood
            by Unicode-oriented clients
          - filenames and file contents in arbitrary other encodings can
            be round-tripped or recovered by clueful clients
          - local strings that have a cached known UTF-8 encoding (aka
            localstr) get sent as UTF-8 so Unicode-oriented clients get the
            Unicode data they want
          - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
          - because we must preserve UTF-8 bytestring in places such as
            filenames, metadata can't be roundtripped without help

          (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
          arbitrary bytes into an internal Unicode format that can be
          re-encoded back into the original. Here we are exposing the
          internal surrogate encoding as a UTF-8 string.)
          """

          if isinstance(s, localstr):
              # assume that the original UTF-8 sequence would never contain
              # invalid characters in U+DCxx range
              return s._utf8
          elif isinstance(s, safelocalstr):
              # already verified that s is non-lossy in legacy encoding, which
              # shouldn't contain characters in U+DCxx range
              return fromlocal(s)
          elif isasciistr(s):
              return s
          if b"\xed" not in s:
              # no 0xed byte means no already-encoded U+DCxx character, so a
              # string that decodes cleanly can be returned as-is
              try:
                  s.decode('utf-8', _utf8strict)
                  return s
              except UnicodeDecodeError:
                  pass

          s = pycompat.bytestr(s)
          # collect the pieces and join once at the end; repeated "+=" on an
          # immutable bytes object copies the accumulated result every time
          parts = []
          pos = 0
          l = len(s)
          while pos < l:
              try:
                  c = getutf8char(s, pos)
              except UnicodeDecodeError:
                  # s[pos] does not start a valid character
                  c = None
              if c is None or b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                  # invalid bytes are escaped as U+DC00 + byte; existing
                  # U+DCxx characters have to be re-escaped the same way,
                  # one byte at a time
                  c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                  pos += 1
              else:
                  pos += len(c)
              parts.append(c)
          return b"".join(parts)

        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
      def fromutf8b(s):
          # type: (bytes) -> bytes
          """Given a UTF-8b string, return a local, possibly-binary string.

          Escaped U+DCxx characters are turned back into the single bytes
          they stand for, which returns the original binary string. This
          is a round-trip process for strings like filenames, but metadata
          that was passed through tolocal will remain in UTF-8.

          >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
          >>> m = b"\\xc3\\xa9\\x99abcd"
          >>> toutf8b(m)
          '\\xc3\\xa9\\xed\\xb2\\x99abcd'
          >>> roundtrip(m)
          True
          >>> roundtrip(b"\\xc2\\xc2\\x80")
          True
          >>> roundtrip(b"\\xef\\xbf\\xbd")
          True
          >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
          True
          >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
          True
          """

          if isasciistr(s):
              return s

          # fast path - look for uDxxx prefixes in s
          if b"\xed" not in s:
              return s

          # We could do this with the unicode type but some Python builds
          # use UTF-16 internally (issue5031) which causes non-BMP code
          # points to be escaped. Instead, we use our handy getutf8char
          # helper again to walk the string without "decoding" it.

          s = pycompat.bytestr(s)
          # collect the pieces and join once at the end; repeated "+=" on an
          # immutable bytes object copies the accumulated result every time
          parts = []
          pos = 0
          l = len(s)
          while pos < l:
              c = getutf8char(s, pos)
              pos += len(c)
              # unescape U+DCxx characters: the low 8 bits of the code point
              # are the original byte
              if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                  c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
              parts.append(c)
          return b"".join(parts)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages