upstream/mercurial-mirror Files · mercurial/encoding.py

dirstate: factor the identity setting code in the dirstate map...

dirstate: factor the identity setting code in the dirstate map We need it in more locations, so let us start factoring thing out first to make sure the same code is called everywhere. This bears some similarity with on default, but at a smaller scope and for a different purpose.

Manuel Jacob - - Load All Authors

File last commit:

r50179:d44e3c45 default


                r51136:342c3c46

stable

Download file

             encoding.py
        
                    725 lines
            
             | 22.2 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / mercurial / encoding.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Martin Geisler
    
put license and copyright info into comment blocks

              r8226
            
      # encoding.py - character transcoding support for Mercurial

      #

        Raphaël Gomès
    
contributor: change mentions of mpm to olivia...

              r47575
            
      #  Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others

        Martin Geisler
    
put license and copyright info into comment blocks

              r8226
            
      #

      # This software may be used and distributed according to the terms of the

        Matt Mackall
    
Update license to GPLv2+

              r10263
            
      # GNU General Public License version 2 or any later version.

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      import locale

      import os

        marmoute
    
windows: enforce upper case drive letter for getcwd in mercurial too...

              r48421
            
      import re

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      import unicodedata

        Gregory Szorc
    
py3: manually import getattr where it is needed...

              r43359
            
      from .pycompat import getattr

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      from . import (

          error,

        Yuya Nishihara
    
parsers: switch to policy importer...

              r32372
            
          policy,

        Yuya Nishihara
    
pycompat: provide 'ispy3' constant...

              r30030
            
          pycompat,

        Gregory Szorc
    
encoding: use absolute_import

              r27355
            
      )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      from .pure import charencode as charencodepure

        Yuya Nishihara
    
encoding: extract stub for fast JSON escape...

              r33925
            
        Yuya Nishihara
    
typing: consolidate "if not globals():" trick...

              r44212
            
      if pycompat.TYPE_CHECKING:

        Augie Fackler
    
encoding: add comment-based type hints for pytype...

              r43802
            
          from typing import (

              Any,

              Callable,

              List,

              Text,

              Type,

              TypeVar,

              Union,

          )

          # keep pyflakes happy

          for t in (Any, Callable, List, Text, Type, Union):

              assert t

        Yuya Nishihara
    
typing: fix forward reference in _Tlocalstr type bound...

              r44075
            
          _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

        Augie Fackler
    
encoding: add comment-based type hints for pytype...

              r43802
            
        Augie Fackler
    
cleanup: remove pointless r-prefixes on single-quoted strings...

              r43906
            
      charencode = policy.importmod('charencode')

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
        Yuya Nishihara
    
encoding: add function to test if a str consists of ASCII characters...

              r33927
            
      isasciistr = charencode.isasciistr

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
      asciilower = charencode.asciilower

      asciiupper = charencode.asciiupper

        Yuya Nishihara
    
encoding: add fast path of jsonescape() (issue5533)...

              r33926
            
      _jsonescapeu8fast = charencode.jsonescapeu8fast

        Yuya Nishihara
    
encoding: drop circular import by proxying through '<policy>.charencode'...

              r33756
            
        Yuya Nishihara
    
py3: convert encoding name and mode to str...

              r30033
            
      _sysstr = pycompat.sysstr

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      unichr = chr

        Gregory Szorc
    
encoding: make HFS+ ignore code Python 3 compatible...

              r28507
            
        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      # These unicode characters are ignored by HFS+ (Apple Technote 1150,

      # "Unicode Subtleties"), so we need to ignore them in some places for

      # sanity.

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      _ignore = [

          unichr(int(x, 16)).encode("utf-8")

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "

          b"206a 206b 206c 206d 206e 206f feff".split()

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      ]

        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      # verify the next function will work

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
      assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)

        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Augie Fackler
    
encoding: add hfsignoreclean to clean out HFS-ignored characters...

              r23596
            
      def hfsignoreclean(s):
          # type: (bytes) -> bytes
          """Strip every codepoint that HFS+ ignores out of the bytes s.

          >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
          '.hg'
          >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
          '.hg'
          """
          # Each ignored sequence starts with 0xe2 or 0xef, so input holding
          # neither byte cannot contain one and is handed back untouched.
          if b"\xe2" not in s and b"\xef" not in s:
              return s
          for ignored in _ignore:
              s = s.replace(ignored, b'')
          return s

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
      # encoding.environ is provided read-only, which may not be used to modify

      # the process environment

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      _nativeenviron = os.supports_bytes_environ

      if _nativeenviron:

        Yuya Nishihara
    
check-code: ignore re-exports of os.environ in encoding.py...

              r32185
            
          environ = os.environb  # re-exports

        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
      else:

          # preferred encoding isn't known yet; use utf-8 to avoid unicode error

          # and recreate it once encoding is settled

        Augie Fackler
    
cleanup: run pyupgrade on our source tree to clean up varying things...

              r44937
            
          environ = {

              k.encode('utf-8'): v.encode('utf-8')

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
              for k, v in os.environ.items()  # re-exports

        Augie Fackler
    
cleanup: run pyupgrade on our source tree to clean up varying things...

              r44937
            
          }

        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
        Martin von Zweigbergk
    
encoding: remove unnecessary lambdas from _encodingfixers...

              r39871
            
      _encodingrewrites = {

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          b'646': b'ascii',

          b'ANSI_X3.4-1968': b'ascii',

        Dan Villiom Podlaski Christiansen
    
encoding: improve handling of buggy getpreferredencoding() on Mac OS X...

              r11892
            
      }

        Yuya Nishihara
    
encoding: alias cp65001 to utf-8 on Windows...

              r38633
            
      # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.

      # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.

      # https://bugs.python.org/issue13216

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      if pycompat.iswindows:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          _encodingrewrites[b'cp65001'] = b'utf-8'

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      try:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          encoding = environ.get(b"HGENCODING")

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
          if not encoding:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
              encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'

        Martin von Zweigbergk
    
encoding: remove unnecessary lambdas from _encodingfixers...

              r39871
            
              encoding = _encodingrewrites.get(encoding, encoding)

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      except locale.Error:

        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
          encoding = b'ascii'

      encodingmode = environ.get(b"HGENCODINGMODE", b"strict")

      fallbackencoding = b'ISO-8859-1'

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: change encoding.localstr to a subclass of bytes, not str

              r33810
            
      class localstr(bytes):
          """Local-encoding bytes that remember the UTF-8 bytes they came from

          Keeping both forms lets an unmodified string be round-tripped to
          the local encoding and back."""

          def __new__(cls, u, l):
              # the bytes value is the local form ``l``; the UTF-8 form ``u``
              # is stored alongside as an attribute
              inst = super().__new__(cls, l)
              inst._utf8 = u
              return inst

          if pycompat.TYPE_CHECKING:
              # pseudo implementation to help pytype see localstr() constructor
              def __init__(self, u, l):
                  # type: (bytes, bytes) -> None
                  super(localstr, self).__init__(l)
                  self._utf8 = u

          def __hash__(self):
              # hash the UTF-8 form to avoid collisions in local string space
              utf8 = self._utf8
              return hash(utf8)

        Matt Mackall
    
encoding: add localstr class to track UTF-8 version of transcoded strings...

              r13046
            
        Yuya Nishihara
    
encoding: introduce tagging type for non-lossy non-ASCII string...

              r37966
            
      class safelocalstr(bytes):
          """Tagged string denoting it was previously an internal UTF-8 string,
          and can be converted back to UTF-8 losslessly

          The class body adds nothing to ``bytes``; the subclass only acts as
          a marker type. The examples below check that an instance compares
          and hashes like the plain bytes it wraps.

          >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
          >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
          >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
          >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
          """

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def tolocal(s):
          # type: (bytes) -> bytes
          """
          Transcode an internal UTF-8 byte string into the local encoding

          Internal strings are expected to be UTF-8, although repositories
          that predate locale support may hold latin1 or other character
          sets. Decoding is attempted strictly as UTF-8 first, then with the
          fallback encoding, and as a last resort as UTF-8 with unknown
          characters replaced.

          When the local form cannot represent the text exactly, the result
          is a localstr that carries the UTF-8 bytes next to the local ones,
          so that fromlocal() can hand them back without loss.

          >>> u = b'foo: \\xc3\\xa4' # utf-8
          >>> l = tolocal(u)
          >>> l
          'foo: ?'
          >>> fromlocal(l)
          'foo: \\xc3\\xa4'
          >>> u2 = b'foo: \\xc3\\xa1'
          >>> d = { l: 1, tolocal(u2): 2 }
          >>> len(d) # no collision
          2
          >>> b'foo: ?' in d
          False
          >>> l1 = b'foo: \\xe4' # historical latin1 fallback
          >>> l = tolocal(l1)
          >>> l
          'foo: ?'
          >>> fromlocal(l) # magically in utf-8
          'foo: \\xc3\\xa4'
          """

          # pure ASCII needs no transcoding at all
          if isasciistr(s):
              return s

          try:
              try:
                  # the strict decode doubles as a check that s really is
                  # stored in UTF-8
                  text = s.decode('UTF-8')
                  if encoding == b'UTF-8':
                      # fast path: nothing to convert
                      return s
                  local = text.encode(_sysstr(encoding), "replace")
                  if text != local.decode(_sysstr(encoding)):
                      # lossy conversion: keep the UTF-8 original alongside
                      return localstr(s, local)
                  # local is a safe, non-lossy encoding of s
                  return safelocalstr(local)
              except UnicodeDecodeError:
                  # we should only get here if we're looking at an ancient
                  # changeset
                  try:
                      text = s.decode(_sysstr(fallbackencoding))
                      local = text.encode(_sysstr(encoding), "replace")
                      if text != local.decode(_sysstr(encoding)):
                          return localstr(text.encode('UTF-8'), local)
                      # local is a safe, non-lossy encoding of s
                      return safelocalstr(local)
                  except UnicodeDecodeError:
                      # last ditch; the result can't be round-tripped
                      return s.decode("utf-8", "replace").encode(
                          _sysstr(encoding), "replace"
                      )
          except LookupError as k:
              # the configured encoding name is unknown to Python
              raise error.Abort(
                  pycompat.bytestr(k), hint=b"please check your locale settings"
              )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def fromlocal(s):
          # type: (bytes) -> bytes
          """
          Transcode a byte string from the local encoding into UTF-8

          Decoding honours the error mode taken from HGENCODINGMODE, which
          defaults to 'strict'; in that mode an undecodable byte aborts with
          a message quoting the bytes around it. The other modes are
          'replace', which substitutes a special Unicode character, and
          'ignore', which drops the offending character.
          """

          # a localstr already carries its UTF-8 form: lossless round-trip
          if isinstance(s, localstr):
              return s._utf8
          # ASCII is identical in UTF-8
          if isasciistr(s):
              return s

          try:
              return s.decode(_sysstr(encoding), _sysstr(encodingmode)).encode(
                  "utf-8"
              )
          except UnicodeDecodeError as err:
              # quote up to 10 bytes on each side of the failing offset
              first = max(0, err.start - 10)
              nearby = s[first : err.start + 10]
              raise error.Abort(
                  b"decoding near '%s': %s!" % (nearby, pycompat.bytestr(err))
              )
          except LookupError as err:
              # the configured encoding name is unknown to Python
              raise error.Abort(
                  pycompat.bytestr(err), hint=b"please check your locale settings"
              )

        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: factor out unicode variants of from/tolocal()...

              r31447
            
      def unitolocal(u):
          # type: (Text) -> bytes
          """Turn a unicode string into local-encoding bytes via tolocal()"""
          utf8 = u.encode('utf-8')
          return tolocal(utf8)

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: factor out unicode variants of from/tolocal()...

              r31447
            
      def unifromlocal(s):
          # type: (bytes) -> Text
          """Turn local-encoding bytes into a unicode string via fromlocal()"""
          utf8 = fromlocal(s)
          return utf8.decode('utf-8')

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: add utility to forward __str__() to __bytes__()...

              r33022
            
      def unimethod(bytesfunc):
          # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
          """Build a proxy method so that __unicode__() and Python 3's
          __str__() can be forwarded to __bytes__()

          The proxy calls ``bytesfunc`` on the object and converts the
          resulting local-encoding bytes with unifromlocal()."""

          def unifunc(obj):
              local = bytesfunc(obj)
              return unifromlocal(local)

          return unifunc

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: add converter between native str and byte string...

              r31448
            
      # converter functions between native str and byte string. use these if the

      # character encoding is not aware (e.g. exception message) or is known to

      # be locale dependent (e.g. date formatting.)

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      strtolocal = unitolocal

      strfromlocal = unifromlocal

      strmethod = unimethod

        Yuya Nishihara
    
encoding: add converter between native str and byte string...

              r31448
            
        Raphaël Gomès
    
encoding: move case-related utils up...

              r48359
            
      def lower(s):
          # type: (bytes) -> bytes
          """Lower-case local string s, honouring its encoding where possible"""
          # fast path: asciilower() signals non-ASCII input by raising
          try:
              return asciilower(s)
          except UnicodeDecodeError:
              pass

          try:
              text = (
                  s._utf8.decode("utf-8")
                  if isinstance(s, localstr)
                  else s.decode(_sysstr(encoding), _sysstr(encodingmode))
              )
              folded = text.lower()
              if folded != text:
                  return folded.encode(_sysstr(encoding))
              # nothing changed: hand back s itself to preserve localstring
              return s
          except UnicodeError:
              # we don't know how to fold this except in ASCII
              return s.lower()
          except LookupError as err:
              raise error.Abort(
                  pycompat.bytestr(err), hint=b"please check your locale settings"
              )

        Raphaël Gomès
    
encoding: move case-related utils up...

              r48359
            
      def upper(s):
          # type: (bytes) -> bytes
          """Upper-case local string s, honouring its encoding where possible"""
          try:
              folded = asciiupper(s)
          except UnicodeDecodeError:
              # not plain ASCII: take the encoding-aware slow path
              return upperfallback(s)
          return folded

      def upperfallback(s):
          # type: (Any) -> Any
          """Encoding-aware slow path used by upper() for non-ASCII input"""
          try:
              text = (
                  s._utf8.decode("utf-8")
                  if isinstance(s, localstr)
                  else s.decode(_sysstr(encoding), _sysstr(encodingmode))
              )
              folded = text.upper()
              if folded != text:
                  return folded.encode(_sysstr(encoding))
              # nothing changed: hand back s itself to preserve localstring
              return s
          except UnicodeError:
              # we don't know how to fold this except in ASCII
              return s.upper()
          except LookupError as err:
              raise error.Abort(
                  pycompat.bytestr(err), hint=b"please check your locale settings"
              )

        Raphaël Gomès
    
encoding: move case-related utils up...

              r48359
            
        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
      if not _nativeenviron:

          # now encoding and helper functions are available, recreate the environ

          # dict to be exported to other modules

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
          if pycompat.iswindows:

        Raphaël Gomès
    
windows: replicate the normalizing behavior of os.environ...

              r48360
            
              class WindowsEnviron(dict):
                  """`os.environ` normalizes environment variables to uppercase on windows"""

                  def get(self, key, default=None):
                      # look the key up in its upper-cased form
                      normalized = upper(key)
                      return super().get(normalized, default)

              environ = WindowsEnviron()

          for k, v in os.environ.items():  # re-exports

              environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))

        Yuya Nishihara
    
py3: provide encoding.environ which is a dict of bytes...

              r30034
            
        marmoute
    
windows: enforce upper case drive letter for getcwd in mercurial too...

              r48421
            
      DRIVE_RE = re.compile(b'^[a-z]:')

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which

      # returns bytes.

      if pycompat.iswindows:

          # Python 3 on Windows issues a DeprecationWarning about using the bytes

          # API when os.getcwdb() is called.

          #

          # Additionally, py3.8+ uppercases the drive letter when calling

          # os.path.realpath(), which is used on ``repo.root``.  Since those

          # strings are compared in various places as simple strings, also call

          # realpath here.  See https://bugs.python.org/issue40368

          #

          # However this is not reliable, so let's explicitly make this drive

          # letter upper case.

          #

          # note: we should consider dropping realpath here since it seems to

          # change the semantics of `getcwd`.

        marmoute
    
windows: enforce upper case drive letter for getcwd in mercurial too...

              r48421
            
        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
          def getcwd():
              """Return the current working directory as a local byte string.

              The path from ``os.getcwd()`` is passed through
              ``os.path.realpath`` and ``strtolocal``; if it then starts with
              a lower-case drive letter (``DRIVE_RE``), that letter is
              upper-cased.
              """
              path = strtolocal(os.path.realpath(os.getcwd()))  # re-exports
              if not DRIVE_RE.match(path):
                  return path
              return path[:1].upper() + path[1:]

        marmoute
    
windows: enforce upper case drive letter for getcwd in mercurial too...

              r48421
            
        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
        Matt Harbison
    
py3: rename pycompat.getcwd() to encoding.getcwd() (API)...

              r39843
            
      else:

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
          getcwd = os.getcwdb  # re-exports

        Matt Harbison
    
py3: rename pycompat.getcwd() to encoding.getcwd() (API)...

              r39843
            
        Matt Mackall
    
encoding: default ambiguous character to narrow...

              r12866
            
      # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
      # East-asian-width codes counted as two columns by ucolwidth(): "A"
      # (ambiguous) is included only when HGENCODINGAMBIGUOUS is b"wide".
      _wide = _sysstr(
          b"WFA"
          if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
          else b"WF"
      )

        Matt Mackall
    
encoding: default ambiguous character to narrow...

              r12866
            
        Matt Mackall
    
move encoding bits from util to encoding...

              r7948
            
      def colwidth(s):
          # type: (bytes) -> int
          """Return the display width, in columns, of local-encoded string s.

          s is decoded with the local ``encoding`` using the 'replace' error
          handler, and the result is measured with ``ucolwidth``.
          """
          decoded = s.decode(_sysstr(encoding), 'replace')
          return ucolwidth(decoded)

        FUJIWARA Katsunori
    
i18n: calculate terminal columns by width information of each characters...

              r15066
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        FUJIWARA Katsunori
    
i18n: calculate terminal columns by width information of each characters...

              r15066
            
      def ucolwidth(d):
          # type: (Text) -> int
          """Return the display width, in columns, of Unicode string d.

          Each character whose ``unicodedata.east_asian_width`` code is in
          ``_wide`` counts as two columns, every other character as one.  If
          ``unicodedata`` has no ``east_asian_width``, ``len(d)`` is returned.
          """
          eaw = getattr(unicodedata, 'east_asian_width', None)
          if eaw is None:
              return len(d)
          return sum(2 if eaw(ch) in _wide else 1 for ch in d)

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: add getcols to extract substrings based on column width

              r15143
            
      def getcols(s, start, c):
          # type: (bytes, int, int) -> bytes
          """Use colwidth to find a c-column substring of s starting at byte
          index start.

          Candidate slices ``s[start:end]`` are tried with increasing ``end``
          and the first one whose ``colwidth`` equals c is returned.

          Raises ValueError if no such slice exists.
          """
          # The search begins at start + c, so candidates are at least c bytes
          # long.  The upper bound is len(s) + 1 (range() excludes its stop
          # value) so that the slice reaching the very end of s is tried too;
          # stopping at len(s) made such a match raise ValueError.
          for end in range(start + c, len(s) + 1):
              candidate = s[start:end]
              if colwidth(candidate) == c:
                  return candidate
          raise ValueError('substring not found')

        Matt Mackall
    
encoding: add getcols to extract substrings based on column width

              r15143
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Augie Fackler
    
formatting: byteify all mercurial/ and hgext/ string literals...

              r43347
            
      def trim(s, width, ellipsis=b'', leftside=False):
          # type: (bytes, int, bytes, bool) -> bytes
          """Trim string 's' to at most 'width' columns (including 'ellipsis').

          If 'leftside' is True, left side of string 's' is trimmed.
          'ellipsis' is always placed at trimmed side.

          If 's' cannot be decoded in the local encoding it is measured and
          cut byte by byte; otherwise it is measured with ucolwidth() and cut
          on character boundaries.  The room reserved for 'ellipsis' is
          len(ellipsis).

          >>> from .node import bin
          >>> def bprint(s):
          ...     print(pycompat.sysstr(s))
          >>> ellipsis = b'+++'
          >>> from . import encoding
          >>> encoding.encoding = b'utf-8'
          >>> t = b'1234567890'
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          1234567890
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          1234567890
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          12345+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++67890
          >>> bprint(trim(t, 8))
          12345678
          >>> bprint(trim(t, 8, leftside=True))
          34567890
          >>> bprint(trim(t, 3, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 1, ellipsis=ellipsis))
          +
          >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
          >>> t = u.encode(pycompat.sysstr(encoding.encoding))
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          \xe3\x81\x82\xe3\x81\x84+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++\xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 5))
          \xe3\x81\x82\xe3\x81\x84
          >>> bprint(trim(t, 5, leftside=True))
          \xe3\x81\x88\xe3\x81\x8a
          >>> bprint(trim(t, 4, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
          +++
          >>> t = bin(b'112233445566778899aa') # invalid byte sequence
          >>> bprint(trim(t, 12, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 10, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 8, ellipsis=ellipsis))
          \x11\x22\x33\x44\x55+++
          >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
          +++\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 8))
          \x11\x22\x33\x44\x55\x66\x77\x88
          >>> bprint(trim(t, 8, leftside=True))
          \x33\x44\x55\x66\x77\x88\x99\xaa
          >>> bprint(trim(t, 3, ellipsis=ellipsis))
          +++
          >>> bprint(trim(t, 1, ellipsis=ellipsis))
          +
          """
          try:
              text = s.decode(_sysstr(encoding))
          except UnicodeDecodeError:
              # undecodable input is handled byte-wise below
              text = None

          if text is None:
              fits = len(s) <= width
          else:
              fits = ucolwidth(text) <= width
          if fits:  # trimming is not needed
              return s

          room = width - len(ellipsis)
          if room <= 0:  # no enough room even for ellipsis
              return ellipsis[:width]

          if text is None:
              if leftside:
                  return ellipsis + s[-room:]
              return s[:room] + ellipsis

          # Walk the characters from the side that is kept, stopping at the
          # first one that would overflow the remaining room.
          chars = list(text)
          if leftside:
              chars.reverse()
          used = 0
          for idx, ch in enumerate(chars):
              used += ucolwidth(ch)
              if used > room:
                  break
          chars = chars[:idx]
          if leftside:
              chars.reverse()
          kept = u''.join(chars).encode(_sysstr(encoding))
          return ellipsis + kept if leftside else kept + ellipsis

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        FUJIWARA Katsunori
    
encoding: add 'trim' to trim multi-byte characters at most specified columns...

              r21856
            
        Gregory Szorc
    
py3: use class X: instead of class X(object):...

              r49801
            
      class normcasespecs:
          """Enumerates what a platform's normcase does to ASCII strings.

          The value is chosen per platform and should match what normcase on
          that platform actually does.

          This should be kept in sync with normcase_spec in util.h."""

          # normcase lowercases ASCII strings
          lower = -1
          # the fallback function should always be called
          other = 0
          # normcase uppercases ASCII strings
          upper = 1

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
encoding: add option to escape non-ascii characters in JSON...

              r28068
            
      def jsonescape(s, paranoid=False):
          # type: (Any, Any) -> Any
          """returns a string suitable for JSON

          JSON is problematic for us because it doesn't support non-Unicode
          bytes. To deal with this, we take the following approach:

          - localstr/safelocalstr objects are converted back to UTF-8
          - valid UTF-8/ASCII strings are passed as-is
          - other strings are converted to UTF-8b surrogate encoding
          - apply JSON-specified string escaping

          The input is first converted with toutf8b(); the fast escaper is
          tried, and the pure fallback is used when it raises ValueError.

          (escapes are doubled in these tests)

          >>> jsonescape(b'this is a test')
          'this is a test'
          >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
          'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
          >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
          'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
          >>> jsonescape(b'a weird byte: \\xdd')
          'a weird byte: \\xed\\xb3\\x9d'
          >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
          'utf-8: caf\\xc3\\xa9'
          >>> jsonescape(b'')
          ''

          If paranoid, non-ascii and common troublesome characters are also escaped.
          This is suitable for web output.

          >>> s = b'escape characters: \\0 \\x0b \\x7f'
          >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
          >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
          >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
          >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
          'escape boundary: ~ \\\\u007f \\\\u0080'
          >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
          'a weird byte: \\\\udcdd'
          >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
          'utf-8: caf\\\\u00e9'
          >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
          'non-BMP: \\\\ud834\\\\udd1e'
          >>> jsonescape(b'<foo@example.org>', paranoid=True)
          '\\\\u003cfoo@example.org\\\\u003e'
          """

          utf8b = toutf8b(s)
          try:
              return _jsonescapeu8fast(utf8b, paranoid)
          except ValueError:
              # the fast path rejected this input; use the pure fallback below
              pass
          return charencodepure.jsonescapeu8fallback(utf8b, paranoid)

        Matt Mackall
    
encoding: add json escaping filter...

              r22426
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Yuya Nishihara
    
py3: use 'surrogatepass' error handler to process U+DCxx transparently...

              r34215
            
      # We need to decode/encode U+DCxx codes transparently since invalid UTF-8

      # bytes are mapped to that range.

        Gregory Szorc
    
encoding: remove Python 2 support code...

              r49747
            
      _utf8strict = r'surrogatepass'

        Yuya Nishihara
    
py3: use 'surrogatepass' error handler to process U+DCxx transparently...

              r34215
            
        Matt Mackall
    
encoding: add getutf8char helper...

              r26875
            
      _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: add getutf8char helper...

              r26875
            
      def getutf8char(s, pos):
          # type: (bytes, int) -> bytes
          """Return the complete utf-8 character of s that begins at pos.

          The result is a bytes slice of s: one byte for ASCII, otherwise as
          many bytes as the first byte's high nibble calls for.

          Raises a UnicodeError if the given location does not start a valid
          utf-8 character.
          """
          lead = s[pos : pos + 1]

          # find how many bytes to attempt decoding from first nibble
          nbytes = _utf8len[ord(lead) >> 4]
          if not nbytes:  # ascii
              return lead

          char = s[pos : pos + nbytes]
          # validate with attempted decode; the decoded value itself is unused
          char.decode("utf-8", _utf8strict)
          return char

        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
      def toutf8b(s):
          # type: (bytes) -> bytes
          """convert a local, possibly-binary string into UTF-8b

          This is intended as a generic method to preserve data when working
          with schemes like JSON and XML that have no provision for
          arbitrary byte strings. As Mercurial often doesn't know
          what encoding data is in, we use so-called UTF-8b.

          If a string is already valid UTF-8 (or ASCII), it passes unmodified.
          Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
          uDC00-uDCFF.

          Principles of operation:

          - ASCII and UTF-8 data successfully round-trips and is understood
            by Unicode-oriented clients
          - filenames and file contents in arbitrary other encodings can
            be round-tripped or recovered by clueful clients
          - local strings that have a cached known UTF-8 encoding (aka
            localstr) get sent as UTF-8 so Unicode-oriented clients get the
            Unicode data they want
          - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
          - because we must preserve UTF-8 bytestring in places such as
            filenames, metadata can't be roundtripped without help

          (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
          arbitrary bytes into an internal Unicode format that can be
          re-encoded back into the original. Here we are exposing the
          internal surrogate encoding as a UTF-8 string.)
          """

          if isinstance(s, localstr):
              # assume that the original UTF-8 sequence would never contain
              # invalid characters in U+DCxx range
              return s._utf8
          elif isinstance(s, safelocalstr):
              # already verified that s is non-lossy in legacy encoding, which
              # shouldn't contain characters in U+DCxx range
              return fromlocal(s)
          elif isasciistr(s):
              return s
          if b"\xed" not in s:
              # without a "\xed" byte there cannot be an encoded U+DCxx
              # character to re-escape, so valid UTF-8 passes through untouched
              try:
                  s.decode('utf-8', _utf8strict)
              except UnicodeDecodeError:
                  # not valid UTF-8: fall through to the byte-wise escaping
                  pass
              else:
                  return s

          s = pycompat.bytestr(s)
          # collect the pieces and join once at the end; appending to a bytes
          # object in the loop would copy the whole result on every iteration
          chunks = []
          pos = 0
          l = len(s)
          while pos < l:
              try:
                  c = getutf8char(s, pos)
              except UnicodeDecodeError:
                  # no valid character starts here
                  c = None
              if c is None or b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                  # escape a single byte as U+DC00 + byte value and advance by
                  # one. This is used for invalid bytes and also for existing
                  # U+DCxx characters, which have to be re-escaped byte by byte
                  # (issue4927)
                  chunks.append(
                      unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                  )
                  pos += 1
              else:
                  chunks.append(c)
                  pos += len(c)
          return b"".join(chunks)

        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
        Augie Fackler
    
formatting: blacken the codebase...

              r43346
            
        Matt Mackall
    
encoding: introduce utf8-b helpers

              r16133
            
      def fromutf8b(s):
          # type: (bytes) -> bytes
          """Given a UTF-8b string, return a local, possibly-binary string.

          return the original binary string. This
          is a round-trip process for strings like filenames, but metadata
          that was passed through tolocal will remain in UTF-8.

          >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
          >>> m = b"\\xc3\\xa9\\x99abcd"
          >>> toutf8b(m)
          '\\xc3\\xa9\\xed\\xb2\\x99abcd'
          >>> roundtrip(m)
          True
          >>> roundtrip(b"\\xc2\\xc2\\x80")
          True
          >>> roundtrip(b"\\xef\\xbf\\xbd")
          True
          >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
          True
          >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
          True
          """

          if isasciistr(s):
              return s
          # fast path - look for uDxxx prefixes in s
          if b"\xed" not in s:
              return s

          # We could do this with the unicode type but some Python builds
          # use UTF-16 internally (issue5031) which causes non-BMP code
          # points to be escaped. Instead, we use our handy getutf8char
          # helper again to walk the string without "decoding" it.

          s = pycompat.bytestr(s)
          # collect the pieces and join once at the end; appending to a bytes
          # object in the loop would copy the whole result on every iteration
          chunks = []
          pos = 0
          l = len(s)
          while pos < l:
              c = getutf8char(s, pos)
              pos += len(c)
              # unescape U+DCxx characters
              if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                  # the low 8 bits of the code point are the original byte
                  c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
              chunks.append(c)
          return b"".join(chunks)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages