upstream/mercurial-mirror Commit - r15143:16c129b0

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():

11

def _getpreferredencoding():

12

'''

12

'''

13

On darwin, getpreferredencoding ignores the locale environment and

13

On darwin, getpreferredencoding ignores the locale environment and

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

15

for Python 2.7 and up. This is the same corrected code for earlier

15

for Python 2.7 and up. This is the same corrected code for earlier

16

Python versions.

16

Python versions.

17

18

However, we can't use a version check for this method, as some distributions

18

However, we can't use a version check for this method, as some distributions

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

20

encoding, as it is unlikely that this encoding is the actually expected.

20

encoding, as it is unlikely that this encoding is the actually expected.

21

'''

21

'''

22

try:

22

try:

23

locale.CODESET

23

locale.CODESET

24

except AttributeError:

24

except AttributeError:

25

# Fall back to parsing environment variables :-(

25

# Fall back to parsing environment variables :-(

26

return locale.getdefaultlocale()[1]

26

return locale.getdefaultlocale()[1]

27

28

oldloc = locale.setlocale(locale.LC_CTYPE)

28

oldloc = locale.setlocale(locale.LC_CTYPE)

29

locale.setlocale(locale.LC_CTYPE, "")

29

locale.setlocale(locale.LC_CTYPE, "")

30

result = locale.nl_langinfo(locale.CODESET)

30

result = locale.nl_langinfo(locale.CODESET)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

32

33

return result

33

return result

34

35

_encodingfixers = {

35

_encodingfixers = {

36

'646': lambda: 'ascii',

36

'646': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

38

'mac-roman': _getpreferredencoding

38

'mac-roman': _getpreferredencoding

39

}

39

}

40

41

try:

41

try:

42

encoding = os.environ.get("HGENCODING")

42

encoding = os.environ.get("HGENCODING")

43

if not encoding:

43

if not encoding:

44

encoding = locale.getpreferredencoding() or 'ascii'

44

encoding = locale.getpreferredencoding() or 'ascii'

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

46

except locale.Error:

46

except locale.Error:

47

encoding = 'ascii'

47

encoding = 'ascii'

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

49

fallbackencoding = 'ISO-8859-1'

49

fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):

51

class localstr(str):

52

'''This class allows strings that are unmodified to be

52

'''This class allows strings that are unmodified to be

53

round-tripped to the local encoding and back'''

53

round-tripped to the local encoding and back'''

54

def __new__(cls, u, l):

54

def __new__(cls, u, l):

55

s = str.__new__(cls, l)

55

s = str.__new__(cls, l)

56

s._utf8 = u

56

s._utf8 = u

57

return s

57

return s

58

def __hash__(self):

58

def __hash__(self):

59

return hash(self._utf8) # avoid collisions in local string space

59

return hash(self._utf8) # avoid collisions in local string space

60

61

def tolocal(s):

61

def tolocal(s):

62

"""

62

"""

63

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

64

65

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

66

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

67

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

69

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

76

>>> l = tolocal(u)

77

>>> l

77

>>> l

78

'foo: ?'

78

'foo: ?'

79

>>> fromlocal(l)

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> d # no collision

83

>>> d # no collision

84

{'foo: ?': 1, 'foo: ?': 2}

84

{'foo: ?': 1, 'foo: ?': 2}

85

>>> 'foo: ?' in d

85

>>> 'foo: ?' in d

86

False

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

88

>>> l = tolocal(l1)

89

>>> l

89

>>> l

90

'foo: ?'

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

92

'foo: \\xc3\\xa4'

93

"""

93

"""

94

95

for e in ('UTF-8', fallbackencoding):

95

for e in ('UTF-8', fallbackencoding):

96

try:

96

try:

97

u = s.decode(e) # attempt strict decoding

97

u = s.decode(e) # attempt strict decoding

98

r = u.encode(encoding, "replace")

98

r = u.encode(encoding, "replace")

99

if u == r.decode(encoding):

99

if u == r.decode(encoding):

100

# r is a safe, non-lossy encoding of s

100

# r is a safe, non-lossy encoding of s

101

return r

101

return r

102

elif e == 'UTF-8':

102

elif e == 'UTF-8':

103

return localstr(s, r)

103

return localstr(s, r)

104

else:

104

else:

105

return localstr(u.encode('UTF-8'), r)

105

return localstr(u.encode('UTF-8'), r)

106

107

except LookupError, k:

107

except LookupError, k:

108

raise error.Abort("%s, please check your locale settings" % k)

108

raise error.Abort("%s, please check your locale settings" % k)

109

except UnicodeDecodeError:

109

except UnicodeDecodeError:

110

pass

110

pass

111

u = s.decode("utf-8", "replace") # last ditch

111

u = s.decode("utf-8", "replace") # last ditch

112

return u.encode(encoding, "replace") # can't round-trip

112

return u.encode(encoding, "replace") # can't round-trip

113

114

def fromlocal(s):

114

def fromlocal(s):

115

"""

115

"""

116

Convert a string from the local character encoding to UTF-8

116

Convert a string from the local character encoding to UTF-8

117

118

We attempt to decode strings using the encoding mode set by

118

We attempt to decode strings using the encoding mode set by

119

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

119

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

120

characters will cause an error message. Other modes include

120

characters will cause an error message. Other modes include

121

'replace', which replaces unknown characters with a special

121

'replace', which replaces unknown characters with a special

122

Unicode character, and 'ignore', which drops the character.

122

Unicode character, and 'ignore', which drops the character.

123

"""

123

"""

124

125

# can we do a lossless round-trip?

125

# can we do a lossless round-trip?

126

if isinstance(s, localstr):

126

if isinstance(s, localstr):

127

return s._utf8

127

return s._utf8

128

129

try:

129

try:

130

return s.decode(encoding, encodingmode).encode("utf-8")

130

return s.decode(encoding, encodingmode).encode("utf-8")

131

except UnicodeDecodeError, inst:

131

except UnicodeDecodeError, inst:

132

sub = s[max(0, inst.start - 10):inst.start + 10]

132

sub = s[max(0, inst.start - 10):inst.start + 10]

133

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

133

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

134

except LookupError, k:

134

except LookupError, k:

135

raise error.Abort("%s, please check your locale settings" % k)

135

raise error.Abort("%s, please check your locale settings" % k)

136

137

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

137

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

138

wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"

138

wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"

139

and "WFA" or "WF")

139

and "WFA" or "WF")

140

141

def colwidth(s):

141

def colwidth(s):

142

"Find the column width of a string for display in the local encoding"

142

"Find the column width of a string for display in the local encoding"

143

return ucolwidth(s.decode(encoding, 'replace'))

143

return ucolwidth(s.decode(encoding, 'replace'))

144

145

def ucolwidth(d):

145

def ucolwidth(d):

146

"Find the column width of a Unicode string for display"

146

"Find the column width of a Unicode string for display"

147

eaw = getattr(unicodedata, 'east_asian_width', None)

147

eaw = getattr(unicodedata, 'east_asian_width', None)

148

if eaw is not None:

148

if eaw is not None:

149

return sum([eaw(c) in wide and 2 or 1 for c in d])

149

return sum([eaw(c) in wide and 2 or 1 for c in d])

150

return len(d)

150

return len(d)

151

152

def getcols(s, start, c):

153

'''Use colwidth to find a c-column substring of s starting at byte

154

index start'''

155

for x in xrange(start + c, len(s)):

156

t = s[start:x]

157

if colwidth(t) == c:

158

return t

159

152

def lower(s):

160

def lower(s):

153

"best-effort encoding-aware case-folding of local string s"

161

"best-effort encoding-aware case-folding of local string s"

154

try:

162

try:

155

if isinstance(s, localstr):

163

if isinstance(s, localstr):

156

u = s._utf8.decode("utf-8")

164

u = s._utf8.decode("utf-8")

157

else:

165

else:

158

u = s.decode(encoding, encodingmode)

166

u = s.decode(encoding, encodingmode)

159

167

160

lu = u.lower()

168

lu = u.lower()

161

if u == lu:

169

if u == lu:

162

return s # preserve localstring

170

return s # preserve localstring

163

return lu.encode(encoding)

171

return lu.encode(encoding)

164

except UnicodeError:

172

except UnicodeError:

165

return s.lower() # we don't know how to fold this except in ASCII

173

return s.lower() # we don't know how to fold this except in ASCII

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 Return the encoding implied by the user's locale settings.

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some
                 distributions patch Python to fix this. Instead, we use it as a
                 'fixer' for the mac-roman encoding, as it is unlikely that this
                 encoding is the one actually expected.

                 LC_CTYPE is switched to the user's default locale only for the
                 duration of the nl_langinfo() query and is put back afterwards,
                 even if the query raises.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 try:
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # setlocale() changes process-wide state; always undo it
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Maps an encoding name reported by the platform to a zero-argument
             # callable returning the name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 # re-query the locale rather than trusting a mac-roman answer
                 'mac-roman': _getpreferredencoding
             }
             # Local encoding: HGENCODING wins when set and non-empty; otherwise ask
             # the locale module (defaulting to 'ascii') and pass the answer through
             # _encodingfixers.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 # the locale could not be queried; fall back to plain ASCII
                 encoding = 'ascii'
             # Error-handler name passed to decode() in fromlocal() and lower().
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # Second decoding tried by tolocal() when its input is not valid UTF-8.
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 Behaves as a plain str holding the local representation, while
                 the UTF-8 form it came from is kept in _utf8 so that an unmodified
                 string can be converted back without loss.'''
                 def __new__(cls, u, l):
                     # u: UTF-8 form to remember; l: local form used as the value
                     inst = str.__new__(cls, l)
                     inst._utf8 = u
                     return inst
                 def __hash__(self):
                     # hash the UTF-8 form: distinct originals may share a local form
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns a plain string when the local form decodes back to the
                 same text (or when no strict decoding succeeded), otherwise a
                 localstr carrying the UTF-8 form. Raises error.Abort when a codec
                 lookup fails.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> d # no collision
                 {'foo: ?': 1, 'foo: ?': 2}
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 for e in ('UTF-8', fallbackencoding):
                     try:
                         u = s.decode(e) # attempt strict decoding
                         # characters missing from the local encoding are replaced
                         # instead of raising
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         elif e == 'UTF-8':
                             # lossy, but s itself is the UTF-8 form to remember
                             return localstr(s, r)
                         else:
                             # lossy, and s was in the fallback encoding: remember
                             # its UTF-8 re-encoding instead
                             return localstr(u.encode('UTF-8'), r)
                     except LookupError, k:
                         # an encoding name was not recognized by the codec registry
                         raise error.Abort("%s, please check your locale settings" % k)
                     except UnicodeDecodeError:
                         # s is not valid in encoding e; try the next candidate
                         pass
                 u = s.decode("utf-8", "replace") # last ditch
                 return u.encode(encoding, "replace") # can't round-trip
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort("%s, please check your locale settings" % k)
             # East Asian width classes that ucolwidth() counts as two columns.
             # 'W' and 'F' are always included; 'A' (ambiguous width) is added only
             # when the HGENCODINGAMBIGUOUS environment variable is set to 'wide'.
             wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                     and "WFA" or "WF")
             def colwidth(s):
                 "Return how many display columns the local-encoding string s takes"
                 # bytes that do not decode are replaced rather than raising
                 text = s.decode(encoding, 'replace')
                 return ucolwidth(text)
             def ucolwidth(d):
                 "Return how many display columns the Unicode string d takes"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 total = 0
                 for ch in d:
                     if eaw(ch) in wide:
                         total += 2
                     else:
                         total += 1
                 return total
+            def getcols(s, start, c):
+                '''Use colwidth to find a c-column substring of s starting at byte
+                index start'''
+                for x in xrange(start + c, len(s)):
+                    t = s[start:x]
+                    if colwidth(t) == c:
+                        return t
             def lower(s):
                 "best-effort encoding-aware lowercasing of local string s"
                 try:
                     if not isinstance(s, localstr):
                         text = s.decode(encoding, encodingmode)
                     else:
                         # a localstr carries its exact UTF-8 form; fold that instead
                         text = s._utf8.decode("utf-8")
                     folded = text.lower()
                     if folded != text:
                         return folded.encode(encoding)
                     return s # already lowercase: preserve localstring
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.lower()