##// END OF EJS Templates
encoding: improve handling of buggy getpreferredencoding() on Mac OS X...
Dan Villiom Podlaski Christiansen -
r11892:2be70ca1 stable
parent child Browse files
Show More
@@ -1,77 +1,97 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import sys, unicodedata, locale, os
9 import sys, unicodedata, locale, os
10
10
11 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
11 def _getpreferredencoding():
12 '''
13 On darwin, getpreferredencoding ignores the locale environment and
14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 for Python 2.7 and up. This is the same corrected code for earlier
16 Python versions.
17
18 However, we can't use a version check for this method, as some distributions
19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 encoding, as it is unlikely that this encoding is the one actually expected.
21 '''
22 try:
23 locale.CODESET
24 except AttributeError:
25 # Fall back to parsing environment variables :-(
26 return locale.getdefaultlocale()[1]
27
28 oldloc = locale.setlocale(locale.LC_CTYPE)
29 locale.setlocale(locale.LC_CTYPE, "")
30 result = locale.nl_langinfo(locale.CODESET)
31 locale.setlocale(locale.LC_CTYPE, oldloc)
32
33 return result
34
35 _encodingfixers = {
36 '646': lambda: 'ascii',
37 'ANSI_X3.4-1968': lambda: 'ascii',
38 'mac-roman': _getpreferredencoding
39 }
12
40
13 try:
41 try:
14 encoding = os.environ.get("HGENCODING")
42 encoding = os.environ.get("HGENCODING")
15 if sys.platform == 'darwin' and not encoding:
16 # On darwin, getpreferredencoding ignores the locale environment and
17 # always returns mac-roman. We override this if the environment is
18 # not C (has been customized by the user).
19 lc = locale.setlocale(locale.LC_CTYPE, '')
20 if lc == 'UTF-8':
21 locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
22 encoding = locale.getlocale()[1]
23 if not encoding:
43 if not encoding:
24 encoding = locale.getpreferredencoding() or 'ascii'
44 encoding = locale.getpreferredencoding() or 'ascii'
25 encoding = _encodingfixup.get(encoding, encoding)
45 encoding = _encodingfixers.get(encoding, lambda: encoding)()
26 except locale.Error:
46 except locale.Error:
27 encoding = 'ascii'
47 encoding = 'ascii'
28 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
29 fallbackencoding = 'ISO-8859-1'
49 fallbackencoding = 'ISO-8859-1'
30
50
31 def tolocal(s):
51 def tolocal(s):
32 """
52 """
33 Convert a string from internal UTF-8 to local encoding
53 Convert a string from internal UTF-8 to local encoding
34
54
35 All internal strings should be UTF-8 but some repos before the
55 All internal strings should be UTF-8 but some repos before the
36 implementation of locale support may contain latin1 or possibly
56 implementation of locale support may contain latin1 or possibly
37 other character sets. We attempt to decode everything strictly
57 other character sets. We attempt to decode everything strictly
38 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
58 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
39 replace unknown characters.
59 replace unknown characters.
40 """
60 """
41 for e in ('UTF-8', fallbackencoding):
61 for e in ('UTF-8', fallbackencoding):
42 try:
62 try:
43 u = s.decode(e) # attempt strict decoding
63 u = s.decode(e) # attempt strict decoding
44 return u.encode(encoding, "replace")
64 return u.encode(encoding, "replace")
45 except LookupError, k:
65 except LookupError, k:
46 raise error.Abort("%s, please check your locale settings" % k)
66 raise error.Abort("%s, please check your locale settings" % k)
47 except UnicodeDecodeError:
67 except UnicodeDecodeError:
48 pass
68 pass
49 u = s.decode("utf-8", "replace") # last ditch
69 u = s.decode("utf-8", "replace") # last ditch
50 return u.encode(encoding, "replace")
70 return u.encode(encoding, "replace")
51
71
52 def fromlocal(s):
72 def fromlocal(s):
53 """
73 """
54 Convert a string from the local character encoding to UTF-8
74 Convert a string from the local character encoding to UTF-8
55
75
56 We attempt to decode strings using the encoding mode set by
76 We attempt to decode strings using the encoding mode set by
57 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
77 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
58 characters will cause an error message. Other modes include
78 characters will cause an error message. Other modes include
59 'replace', which replaces unknown characters with a special
79 'replace', which replaces unknown characters with a special
60 Unicode character, and 'ignore', which drops the character.
80 Unicode character, and 'ignore', which drops the character.
61 """
81 """
62 try:
82 try:
63 return s.decode(encoding, encodingmode).encode("utf-8")
83 return s.decode(encoding, encodingmode).encode("utf-8")
64 except UnicodeDecodeError, inst:
84 except UnicodeDecodeError, inst:
65 sub = s[max(0, inst.start - 10):inst.start + 10]
85 sub = s[max(0, inst.start - 10):inst.start + 10]
66 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
86 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
67 except LookupError, k:
87 except LookupError, k:
68 raise error.Abort("%s, please check your locale settings" % k)
88 raise error.Abort("%s, please check your locale settings" % k)
69
89
70 def colwidth(s):
90 def colwidth(s):
71 "Find the column width of a string in the local encoding for display"
91 "Find the column width of a string in the local encoding for display"
72 d = s.decode(encoding, 'replace')
92 d = s.decode(encoding, 'replace')
73 if hasattr(unicodedata, 'east_asian_width'):
93 if hasattr(unicodedata, 'east_asian_width'):
74 w = unicodedata.east_asian_width
94 w = unicodedata.east_asian_width
75 return sum([w(c) in 'WFA' and 2 or 1 for c in d])
95 return sum([w(c) in 'WFA' and 2 or 1 for c in d])
76 return len(d)
96 return len(d)
77
97
General Comments 0
You need to be logged in to leave comments. Login now