##// END OF EJS Templates
encoding: fix issue with non-standard UTF-8 CTYPE on OS X
Dirkjan Ochtman -
r9574:9e9f63d5 default
parent child Browse files
Show More
@@ -1,75 +1,77 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2, incorporated herein by reference.
6 # GNU General Public License version 2, incorporated herein by reference.
7
7
8 import error
8 import error
9 import sys, unicodedata, locale, os
9 import sys, unicodedata, locale, os
10
10
11 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
11 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
12
12
13 try:
13 try:
14 encoding = os.environ.get("HGENCODING")
14 encoding = os.environ.get("HGENCODING")
15 if sys.platform == 'darwin' and not encoding:
15 if sys.platform == 'darwin' and not encoding:
16 # On darwin, getpreferredencoding ignores the locale environment and
16 # On darwin, getpreferredencoding ignores the locale environment and
17 # always returns mac-roman. We override this if the environment is
17 # always returns mac-roman. We override this if the environment is
18 # not C (has been customized by the user).
18 # not C (has been customized by the user).
19 locale.setlocale(locale.LC_CTYPE, '')
19 lc = locale.setlocale(locale.LC_CTYPE, '')
20 if lc == 'UTF-8':
21 locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
20 encoding = locale.getlocale()[1]
22 encoding = locale.getlocale()[1]
21 if not encoding:
23 if not encoding:
22 encoding = locale.getpreferredencoding() or 'ascii'
24 encoding = locale.getpreferredencoding() or 'ascii'
23 encoding = _encodingfixup.get(encoding, encoding)
25 encoding = _encodingfixup.get(encoding, encoding)
24 except locale.Error:
26 except locale.Error:
25 encoding = 'ascii'
27 encoding = 'ascii'
26 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
28 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
27 fallbackencoding = 'ISO-8859-1'
29 fallbackencoding = 'ISO-8859-1'
28
30
29 def tolocal(s):
31 def tolocal(s):
30 """
32 """
31 Convert a string from internal UTF-8 to local encoding
33 Convert a string from internal UTF-8 to local encoding
32
34
33 All internal strings should be UTF-8 but some repos before the
35 All internal strings should be UTF-8 but some repos before the
34 implementation of locale support may contain latin1 or possibly
36 implementation of locale support may contain latin1 or possibly
35 other character sets. We attempt to decode everything strictly
37 other character sets. We attempt to decode everything strictly
36 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
38 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
37 replace unknown characters.
39 replace unknown characters.
38 """
40 """
39 for e in ('UTF-8', fallbackencoding):
41 for e in ('UTF-8', fallbackencoding):
40 try:
42 try:
41 u = s.decode(e) # attempt strict decoding
43 u = s.decode(e) # attempt strict decoding
42 return u.encode(encoding, "replace")
44 return u.encode(encoding, "replace")
43 except LookupError, k:
45 except LookupError, k:
44 raise error.Abort("%s, please check your locale settings" % k)
46 raise error.Abort("%s, please check your locale settings" % k)
45 except UnicodeDecodeError:
47 except UnicodeDecodeError:
46 pass
48 pass
47 u = s.decode("utf-8", "replace") # last ditch
49 u = s.decode("utf-8", "replace") # last ditch
48 return u.encode(encoding, "replace")
50 return u.encode(encoding, "replace")
49
51
50 def fromlocal(s):
52 def fromlocal(s):
51 """
53 """
52 Convert a string from the local character encoding to UTF-8
54 Convert a string from the local character encoding to UTF-8
53
55
54 We attempt to decode strings using the encoding mode set by
56 We attempt to decode strings using the encoding mode set by
55 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
57 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
56 characters will cause an error message. Other modes include
58 characters will cause an error message. Other modes include
57 'replace', which replaces unknown characters with a special
59 'replace', which replaces unknown characters with a special
58 Unicode character, and 'ignore', which drops the character.
60 Unicode character, and 'ignore', which drops the character.
59 """
61 """
60 try:
62 try:
61 return s.decode(encoding, encodingmode).encode("utf-8")
63 return s.decode(encoding, encodingmode).encode("utf-8")
62 except UnicodeDecodeError, inst:
64 except UnicodeDecodeError, inst:
63 sub = s[max(0, inst.start-10):inst.start+10]
65 sub = s[max(0, inst.start-10):inst.start+10]
64 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
66 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
65 except LookupError, k:
67 except LookupError, k:
66 raise error.Abort("%s, please check your locale settings" % k)
68 raise error.Abort("%s, please check your locale settings" % k)
67
69
68 def colwidth(s):
70 def colwidth(s):
69 "Find the column width of a UTF-8 string for display"
71 "Find the column width of a UTF-8 string for display"
70 d = s.decode(encoding, 'replace')
72 d = s.decode(encoding, 'replace')
71 if hasattr(unicodedata, 'east_asian_width'):
73 if hasattr(unicodedata, 'east_asian_width'):
72 w = unicodedata.east_asian_width
74 w = unicodedata.east_asian_width
73 return sum([w(c) in 'WF' and 2 or 1 for c in d])
75 return sum([w(c) in 'WF' and 2 or 1 for c in d])
74 return len(d)
76 return len(d)
75
77
General Comments 0
You need to be logged in to leave comments. Login now