upstream/mercurial-mirror Commit - r13046:7cc4263e

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():

11

def _getpreferredencoding():

12

'''

12

'''

13

On darwin, getpreferredencoding ignores the locale environment and

13

On darwin, getpreferredencoding ignores the locale environment and

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

15

for Python 2.7 and up. This is the same corrected code for earlier

15

for Python 2.7 and up. This is the same corrected code for earlier

16

Python versions.

16

Python versions.

17

18

However, we can't use a version check for this method, as some distributions

18

However, we can't use a version check for this method, as some distributions

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

20

encoding, as it is unlikely that this encoding is the one actually expected.

20

encoding, as it is unlikely that this encoding is the one actually expected.

21

'''

21

'''

22

try:

22

try:

23

locale.CODESET

23

locale.CODESET

24

except AttributeError:

24

except AttributeError:

25

# Fall back to parsing environment variables :-(

25

# Fall back to parsing environment variables :-(

26

return locale.getdefaultlocale()[1]

26

return locale.getdefaultlocale()[1]

27

28

oldloc = locale.setlocale(locale.LC_CTYPE)

28

oldloc = locale.setlocale(locale.LC_CTYPE)

29

locale.setlocale(locale.LC_CTYPE, "")

29

locale.setlocale(locale.LC_CTYPE, "")

30

result = locale.nl_langinfo(locale.CODESET)

30

result = locale.nl_langinfo(locale.CODESET)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

32

33

return result

33

return result

34

35

_encodingfixers = {

35

_encodingfixers = {

36

'646': lambda: 'ascii',

36

'646': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

38

'mac-roman': _getpreferredencoding

38

'mac-roman': _getpreferredencoding

39

}

39

}

40

41

try:

41

try:

42

encoding = os.environ.get("HGENCODING")

42

encoding = os.environ.get("HGENCODING")

43

if not encoding:

43

if not encoding:

44

encoding = locale.getpreferredencoding() or 'ascii'

44

encoding = locale.getpreferredencoding() or 'ascii'

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

46

except locale.Error:

46

except locale.Error:

47

encoding = 'ascii'

47

encoding = 'ascii'

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

49

fallbackencoding = 'ISO-8859-1'

49

fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):

52

'''This class allows strings that are unmodified to be

53

round-tripped to the local encoding and back'''

54

def __new__(cls, u, l):

55

s = str.__new__(cls, l)

56

s._utf8 = u

57

return s

58

def __hash__(self):

59

return hash(self._utf8) # avoid collisions in local string space

60

51

def tolocal(s):

61

def tolocal(s):

52

"""

62

"""

53

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

54

64

55

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

56

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

57

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

58

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

59

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

77

>>> l

78

'foo: ?'

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> d # no collision

84

{'foo: ?': 1, 'foo: ?': 2}

85

>>> 'foo: ?' in d

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

89

>>> l

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

60

"""

93

"""

94

61

for e in ('UTF-8', fallbackencoding):

95

for e in ('UTF-8', fallbackencoding):

62

try:

96

try:

63

u = s.decode(e) # attempt strict decoding

97

u = s.decode(e) # attempt strict decoding

64

return u.encode(encoding, "replace")

98

if u == 'UTF-8':

99

return localstr(s, u.encode(encoding, "replace"))

100

else:

101

return localstr(u.encode('UTF-8'),

102

u.encode(encoding, "replace"))

65

except LookupError, k:

103

except LookupError, k:

66

raise error.Abort("%s, please check your locale settings" % k)

104

raise error.Abort("%s, please check your locale settings" % k)

67

except UnicodeDecodeError:

105

except UnicodeDecodeError:

68

pass

106

pass

69

u = s.decode("utf-8", "replace") # last ditch

107

u = s.decode("utf-8", "replace") # last ditch

70

return u.encode(encoding, "replace")

108

return u.encode(encoding, "replace") # can't round-trip

71

109

72

def fromlocal(s):

110

def fromlocal(s):

73

"""

111

"""

74

Convert a string from the local character encoding to UTF-8

112

Convert a string from the local character encoding to UTF-8

75

113

76

We attempt to decode strings using the encoding mode set by

114

We attempt to decode strings using the encoding mode set by

77

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

115

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

78

characters will cause an error message. Other modes include

116

characters will cause an error message. Other modes include

79

'replace', which replaces unknown characters with a special

117

'replace', which replaces unknown characters with a special

80

Unicode character, and 'ignore', which drops the character.

118

Unicode character, and 'ignore', which drops the character.

81

"""

119

"""

120

121

# can we do a lossless round-trip?

122

if isinstance(s, localstr):

123

return s._utf8

124

82

try:

125

try:

83

return s.decode(encoding, encodingmode).encode("utf-8")

126

return s.decode(encoding, encodingmode).encode("utf-8")

84

except UnicodeDecodeError, inst:

127

except UnicodeDecodeError, inst:

85

sub = s[max(0, inst.start - 10):inst.start + 10]

128

sub = s[max(0, inst.start - 10):inst.start + 10]

86

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

129

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

87

except LookupError, k:

130

except LookupError, k:

88

raise error.Abort("%s, please check your locale settings" % k)

131

raise error.Abort("%s, please check your locale settings" % k)

89

132

90

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

133

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

91

ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

134

ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

92

135

93

def colwidth(s):

136

def colwidth(s):

94

"Find the column width of a UTF-8 string for display"

137

"Find the column width of a UTF-8 string for display"

95

d = s.decode(encoding, 'replace')

138

d = s.decode(encoding, 'replace')

96

if hasattr(unicodedata, 'east_asian_width'):

139

if hasattr(unicodedata, 'east_asian_width'):

97

wide = "WF"

140

wide = "WF"

98

if ambiguous == "wide":

141

if ambiguous == "wide":

99

wide = "WFA"

142

wide = "WFA"

100

w = unicodedata.east_asian_width

143

w = unicodedata.east_asian_width

101

return sum([w(c) in wide and 2 or 1 for c in d])

144

return sum([w(c) in wide and 2 or 1 for c in d])

102

return len(d)

145

return len(d)

103

146

             # this is a hack to make sure no escape characters are inserted into the output
             import os
             if 'TERM' in os.environ:
                 del os.environ['TERM']
             import doctest
             import mercurial.changelog
             doctest.testmod(mercurial.changelog)
             import mercurial.dagparser
             doctest.testmod(mercurial.dagparser, optionflags=doctest.NORMALIZE_WHITESPACE)
             import mercurial.match
             doctest.testmod(mercurial.match)
-            import mercurial.url
+            import mercurial.encoding
-            doctest.testmod(mercurial.url)
+            doctest.testmod(mercurial.encoding)
             import hgext.convert.cvsps
             doctest.testmod(hgext.convert.cvsps)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 Return the character encoding of the user's locale.

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some
                 distributions patch Python to fix this. Instead, we use it as a
                 'fixer' for the mac-roman encoding, as it is unlikely that this
                 encoding is the one actually expected.

                 The LC_CTYPE locale is switched to the user's default while the
                 codeset is queried and is restored afterwards, even if the query
                 raises.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 try:
                     locale.setlocale(locale.LC_CTYPE, "")
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # never leave the process with a modified locale behind
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Replacements for encoding names reported by the locale module, keyed
             # by the reported name. Each value is a zero-argument callable that
             # returns the encoding name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 # re-detect the encoding; see _getpreferredencoding's docstring
                 'mac-roman': _getpreferredencoding
             }
             # Choose the local encoding at import time: a non-empty HGENCODING
             # wins as is; otherwise ask the locale module (defaulting to 'ascii')
             # and pass the answer through _encodingfixers. A locale.Error raised
             # anywhere in that process also results in 'ascii'.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # Error-handling mode that fromlocal() passes to decode().
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # Encoding tolocal() tries after strict UTF-8 decoding has failed.
             fallbackencoding = 'ISO-8859-1'
+            class localstr(str):
+                '''This class allows strings that are unmodified to be
+                round-tripped to the local encoding and back'''
+                def __new__(cls, u, l):
+                    s = str.__new__(cls, l)
+                    s._utf8 = u
+                    return s
+                def __hash__(self):
+                    return hash(self._utf8) # avoid collisions in local string space
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding
                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.
+                The localstr class is used to cache the known UTF-8 encoding of
+                strings next to their local representation to allow lossless
+                round-trip conversion back to UTF-8.
+                >>> u = 'foo: \\xc3\\xa4' # utf-8
+                >>> l = tolocal(u)
+                >>> l
+                'foo: ?'
+                >>> fromlocal(l)
+                'foo: \\xc3\\xa4'
+                >>> u2 = 'foo: \\xc3\\xa1'
+                >>> d = { l: 1, tolocal(u2): 2 }
+                >>> d # no collision
+                {'foo: ?': 1, 'foo: ?': 2}
+                >>> 'foo: ?' in d
+                False
+                >>> l1 = 'foo: \\xe4' # historical latin1 fallback
+                >>> l = tolocal(l1)
+                >>> l
+                'foo: ?'
+                >>> fromlocal(l) # magically in utf-8
+                'foo: \\xc3\\xa4'
                 """
                 for e in ('UTF-8', fallbackencoding):
                     try:
                         u = s.decode(e) # attempt strict decoding
-                        return u.encode(encoding, "replace")
+                        if u == 'UTF-8':
+                            return localstr(s, u.encode(encoding, "replace"))
+                        else:
+                            return localstr(u.encode('UTF-8'),
+                                            u.encode(encoding, "replace"))
                     except LookupError, k:
                         raise error.Abort("%s, please check your locale settings" % k)
                     except UnicodeDecodeError:
                         pass
                 u = s.decode("utf-8", "replace") # last ditch
-                return u.encode(encoding, "replace")
+                return u.encode(encoding, "replace") # can't round-trip
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
+                # can we do a lossless round-trip?
+                if isinstance(s, localstr):
+                    return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort("%s, please check your locale settings" % k)
             # Width policy for East Asian "ambiguous" characters: they count as one
             # column unless HGENCODINGAMBIGUOUS is set to 'wide'.
             ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")
             def colwidth(s):
                 """Return the number of display columns occupied by s

                 s is decoded with the local encoding, undecodable bytes being
                 replaced. Characters whose East Asian width class is wide or
                 fullwidth (and ambiguous, when so configured) take two columns,
                 everything else takes one. Without unicodedata.east_asian_width
                 the result is simply the number of decoded characters.
                 """
                 text = s.decode(encoding, 'replace')
                 if not hasattr(unicodedata, 'east_asian_width'):
                     return len(text)
                 doublewidth = "WF"
                 if ambiguous == "wide":
                     doublewidth = "WFA"
                 columns = 0
                 for char in text:
                     if unicodedata.east_asian_width(char) in doublewidth:
                         columns += 2
                     else:
                         columns += 1
                 return columns