upstream/mercurial-mirror Commit - r13051:120eccaa

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():

11

def _getpreferredencoding():

12

'''

12

'''

13

On darwin, getpreferredencoding ignores the locale environment and

13

On darwin, getpreferredencoding ignores the locale environment and

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

15

for Python 2.7 and up. This is the same corrected code for earlier

15

for Python 2.7 and up. This is the same corrected code for earlier

16

Python versions.

16

Python versions.

17

18

However, we can't use a version check for this method, as some distributions

18

However, we can't use a version check for this method, as some distributions

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

20

encoding, as it is unlikely that this encoding is the actually expected.

20

encoding, as it is unlikely that this encoding is the actually expected.

21

'''

21

'''

22

try:

22

try:

23

locale.CODESET

23

locale.CODESET

24

except AttributeError:

24

except AttributeError:

25

# Fall back to parsing environment variables :-(

25

# Fall back to parsing environment variables :-(

26

return locale.getdefaultlocale()[1]

26

return locale.getdefaultlocale()[1]

27

28

oldloc = locale.setlocale(locale.LC_CTYPE)

28

oldloc = locale.setlocale(locale.LC_CTYPE)

29

locale.setlocale(locale.LC_CTYPE, "")

29

locale.setlocale(locale.LC_CTYPE, "")

30

result = locale.nl_langinfo(locale.CODESET)

30

result = locale.nl_langinfo(locale.CODESET)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

32

33

return result

33

return result

34

35

_encodingfixers = {

35

_encodingfixers = {

36

'646': lambda: 'ascii',

36

'646': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

38

'mac-roman': _getpreferredencoding

38

'mac-roman': _getpreferredencoding

39

}

39

}

40

41

try:

41

try:

42

encoding = os.environ.get("HGENCODING")

42

encoding = os.environ.get("HGENCODING")

43

if not encoding:

43

if not encoding:

44

encoding = locale.getpreferredencoding() or 'ascii'

44

encoding = locale.getpreferredencoding() or 'ascii'

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

46

except locale.Error:

46

except locale.Error:

47

encoding = 'ascii'

47

encoding = 'ascii'

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

49

fallbackencoding = 'ISO-8859-1'

49

fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):

51

class localstr(str):

52

'''This class allows strings that are unmodified to be

52

'''This class allows strings that are unmodified to be

53

round-tripped to the local encoding and back'''

53

round-tripped to the local encoding and back'''

54

def __new__(cls, u, l):

54

def __new__(cls, u, l):

55

s = str.__new__(cls, l)

55

s = str.__new__(cls, l)

56

s._utf8 = u

56

s._utf8 = u

57

return s

57

return s

58

def __hash__(self):

58

def __hash__(self):

59

return hash(self._utf8) # avoid collisions in local string space

59

return hash(self._utf8) # avoid collisions in local string space

60

61

def tolocal(s):

61

def tolocal(s):

62

"""

62

"""

63

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

64

65

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

66

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

67

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

69

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

76

>>> l = tolocal(u)

77

>>> l

77

>>> l

78

'foo: ?'

78

'foo: ?'

79

>>> fromlocal(l)

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> d # no collision

83

>>> d # no collision

84

{'foo: ?': 1, 'foo: ?': 2}

84

{'foo: ?': 1, 'foo: ?': 2}

85

>>> 'foo: ?' in d

85

>>> 'foo: ?' in d

86

False

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

88

>>> l = tolocal(l1)

89

>>> l

89

>>> l

90

'foo: ?'

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

92

'foo: \\xc3\\xa4'

93

"""

93

"""

94

95

for e in ('UTF-8', fallbackencoding):

95

for e in ('UTF-8', fallbackencoding):

96

try:

96

try:

97

u = s.decode(e) # attempt strict decoding

97

u = s.decode(e) # attempt strict decoding

98

if u == 'UTF-8':

98

if e == 'UTF-8':

99

return localstr(s, u.encode(encoding, "replace"))

99

return localstr(s, u.encode(encoding, "replace"))

100

else:

100

else:

101

return localstr(u.encode('UTF-8'),

101

return localstr(u.encode('UTF-8'),

102

u.encode(encoding, "replace"))

102

u.encode(encoding, "replace"))

103

except LookupError, k:

103

except LookupError, k:

104

raise error.Abort("%s, please check your locale settings" % k)

104

raise error.Abort("%s, please check your locale settings" % k)

105

except UnicodeDecodeError:

105

except UnicodeDecodeError:

106

pass

106

pass

107

u = s.decode("utf-8", "replace") # last ditch

107

u = s.decode("utf-8", "replace") # last ditch

108

return u.encode(encoding, "replace") # can't round-trip

108

return u.encode(encoding, "replace") # can't round-trip

109

110

def fromlocal(s):

110

def fromlocal(s):

111

"""

111

"""

112

Convert a string from the local character encoding to UTF-8

112

Convert a string from the local character encoding to UTF-8

113

114

We attempt to decode strings using the encoding mode set by

114

We attempt to decode strings using the encoding mode set by

115

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

115

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

116

characters will cause an error message. Other modes include

116

characters will cause an error message. Other modes include

117

'replace', which replaces unknown characters with a special

117

'replace', which replaces unknown characters with a special

118

Unicode character, and 'ignore', which drops the character.

118

Unicode character, and 'ignore', which drops the character.

119

"""

119

"""

120

121

# can we do a lossless round-trip?

121

# can we do a lossless round-trip?

122

if isinstance(s, localstr):

122

if isinstance(s, localstr):

123

return s._utf8

123

return s._utf8

124

125

try:

125

try:

126

return s.decode(encoding, encodingmode).encode("utf-8")

126

return s.decode(encoding, encodingmode).encode("utf-8")

127

except UnicodeDecodeError, inst:

127

except UnicodeDecodeError, inst:

128

sub = s[max(0, inst.start - 10):inst.start + 10]

128

sub = s[max(0, inst.start - 10):inst.start + 10]

129

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

129

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

130

except LookupError, k:

130

except LookupError, k:

131

raise error.Abort("%s, please check your locale settings" % k)

131

raise error.Abort("%s, please check your locale settings" % k)

132

133

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

133

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

134

ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

134

ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

135

136

def colwidth(s):

136

def colwidth(s):

137

"Find the column width of a UTF-8 string for display"

137

"Find the column width of a UTF-8 string for display"

138

d = s.decode(encoding, 'replace')

138

d = s.decode(encoding, 'replace')

139

if hasattr(unicodedata, 'east_asian_width'):

139

if hasattr(unicodedata, 'east_asian_width'):

140

wide = "WF"

140

wide = "WF"

141

if ambiguous == "wide":

141

if ambiguous == "wide":

142

wide = "WFA"

142

wide = "WFA"

143

w = unicodedata.east_asian_width

143

w = unicodedata.east_asian_width

144

return sum([w(c) in wide and 2 or 1 for c in d])

144

return sum([w(c) in wide and 2 or 1 for c in d])

145

return len(d)

145

return len(d)

146

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.
                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the actually expected.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 result = locale.nl_langinfo(locale.CODESET)
                 locale.setlocale(locale.LC_CTYPE, oldloc)
                 return result
             # Encoding names that need correcting, mapped to a callable returning
             # the encoding name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }

             # HGENCODING, when set and non-empty, wins. Otherwise ask the locale
             # and run the answer through _encodingfixers. A locale.Error raised
             # along the way selects plain ASCII.
             encoding = os.environ.get("HGENCODING")
             if not encoding:
                 try:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
                 except locale.Error:
                     encoding = 'ascii'

             # Error handling scheme passed to decode() by fromlocal.
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # Second encoding tried by tolocal when strict UTF-8 decoding fails.
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 The string value itself is the local representation; the UTF-8
                 form is kept in the _utf8 attribute so that an unmodified string
                 can be round-tripped to the local encoding and back.
                 '''

                 def __new__(cls, u, l):
                     # u: the UTF-8 form to remember; l: the local form, which
                     # becomes the actual str value.
                     inst = str.__new__(cls, l)
                     inst._utf8 = u
                     return inst

                 def __hash__(self):
                     # Hash on the UTF-8 form to avoid collisions in local string
                     # space (different UTF-8 strings may share one lossy local
                     # representation).
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding
                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.
                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.
                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> d # no collision
                 {'foo: ?': 1, 'foo: ?': 2}
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 for e in ('UTF-8', fallbackencoding):
                     try:
                         u = s.decode(e) # attempt strict decoding
-                        if u == 'UTF-8':
+                        if e == 'UTF-8':
                             return localstr(s, u.encode(encoding, "replace"))
                         else:
                             return localstr(u.encode('UTF-8'),
                                             u.encode(encoding, "replace"))
                     except LookupError, k:
                         raise error.Abort("%s, please check your locale settings" % k)
                     except UnicodeDecodeError:
                         pass
                 u = s.decode("utf-8", "replace") # last ditch
                 return u.encode(encoding, "replace") # can't round-trip
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort("%s, please check your locale settings" % k)
             # How ambiguous-width characters are measured. Set HGENCODINGAMBIGUOUS
             # to 'wide' to count them as two columns.
             ambiguous = os.environ.get("HGENCODINGAMBIGUOUS", "narrow")

             def colwidth(s):
                 """Return the number of display columns taken by s.

                 s is decoded with the module-level encoding, replacing bytes
                 that cannot be decoded. Characters whose East Asian width is
                 wide or fullwidth (and ambiguous, when the module-level
                 ambiguous setting is 'wide') count as two columns; all others
                 count as one. Without unicodedata.east_asian_width the result
                 is the number of decoded characters.
                 """
                 chars = s.decode(encoding, 'replace')
                 if not hasattr(unicodedata, 'east_asian_width'):
                     return len(chars)

                 if ambiguous == "wide":
                     doublewidth = "WFA"
                 else:
                     doublewidth = "WF"
                 eaw = unicodedata.east_asian_width
                 # one column per character, plus one more for each wide one
                 return len(chars) + len([c for c in chars if eaw(c) in doublewidth])