##// END OF EJS Templates
encoding: add getcols to extract substrings based on column width
Matt Mackall -
r15143:16c129b0 default
parent child Browse files
Show More
@@ -1,165 +1,173 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
def _getpreferredencoding():
    '''
    Return the codeset named by the user's locale environment.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a
    'fixer' for the mac-roman encoding, as it is unlikely that this
    encoding is the one actually expected.

    The process-wide LC_CTYPE setting is switched to the environment's
    locale only for the duration of the query and is always put back,
    even if the query fails. Exceptions from the locale module
    propagate to the caller.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process with a modified locale
        locale.setlocale(locale.LC_CTYPE, oldloc)
34 34
# Corrections for encoding names reported by the platform, keyed by the
# reported name. Each value is a callable returning the name to use.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# A non-empty HGENCODING environment variable is used as-is. Otherwise
# ask the locale module (defaulting to 'ascii') and pass the answer
# through _encodingfixers.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    # the locale module rejected the environment's settings
    encoding = 'ascii'
# error handling mode passed to decode() by fromlocal and lower
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# second encoding tried by tolocal when strict UTF-8 decoding fails
fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''A local-encoding string that remembers the UTF-8 string it was
    made from, so that an unmodified string can be converted back to
    UTF-8 without loss'''
    def __new__(cls, u, l):
        # the str value is the local form l; u is kept alongside it
        self = super(localstr, cls).__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
60 60
61 61 def tolocal(s):
62 62 """
63 63 Convert a string from internal UTF-8 to local encoding
64 64
65 65 All internal strings should be UTF-8 but some repos before the
66 66 implementation of locale support may contain latin1 or possibly
67 67 other character sets. We attempt to decode everything strictly
68 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 69 replace unknown characters.
70 70
71 71 The localstr class is used to cache the known UTF-8 encoding of
72 72 strings next to their local representation to allow lossless
73 73 round-trip conversion back to UTF-8.
74 74
75 75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 76 >>> l = tolocal(u)
77 77 >>> l
78 78 'foo: ?'
79 79 >>> fromlocal(l)
80 80 'foo: \\xc3\\xa4'
81 81 >>> u2 = 'foo: \\xc3\\xa1'
82 82 >>> d = { l: 1, tolocal(u2): 2 }
83 83 >>> d # no collision
84 84 {'foo: ?': 1, 'foo: ?': 2}
85 85 >>> 'foo: ?' in d
86 86 False
87 87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 88 >>> l = tolocal(l1)
89 89 >>> l
90 90 'foo: ?'
91 91 >>> fromlocal(l) # magically in utf-8
92 92 'foo: \\xc3\\xa4'
93 93 """
94 94
95 95 for e in ('UTF-8', fallbackencoding):
96 96 try:
97 97 u = s.decode(e) # attempt strict decoding
98 98 r = u.encode(encoding, "replace")
99 99 if u == r.decode(encoding):
100 100 # r is a safe, non-lossy encoding of s
101 101 return r
102 102 elif e == 'UTF-8':
103 103 return localstr(s, r)
104 104 else:
105 105 return localstr(u.encode('UTF-8'), r)
106 106
107 107 except LookupError, k:
108 108 raise error.Abort("%s, please check your locale settings" % k)
109 109 except UnicodeDecodeError:
110 110 pass
111 111 u = s.decode("utf-8", "replace") # last ditch
112 112 return u.encode(encoding, "replace") # can't round-trip
113 113
114 114 def fromlocal(s):
115 115 """
116 116 Convert a string from the local character encoding to UTF-8
117 117
118 118 We attempt to decode strings using the encoding mode set by
119 119 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
120 120 characters will cause an error message. Other modes include
121 121 'replace', which replaces unknown characters with a special
122 122 Unicode character, and 'ignore', which drops the character.
123 123 """
124 124
125 125 # can we do a lossless round-trip?
126 126 if isinstance(s, localstr):
127 127 return s._utf8
128 128
129 129 try:
130 130 return s.decode(encoding, encodingmode).encode("utf-8")
131 131 except UnicodeDecodeError, inst:
132 132 sub = s[max(0, inst.start - 10):inst.start + 10]
133 133 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
134 134 except LookupError, k:
135 135 raise error.Abort("%s, please check your locale settings" % k)
136 136
# East Asian Width classes that ucolwidth counts as two columns. Setting
# HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class 'A' to them.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
140 140
def colwidth(s):
    """Return the number of display columns taken by s, a string in the
    local encoding"""
    # undecodable bytes are replaced instead of raising
    u = s.decode(encoding, 'replace')
    return ucolwidth(u)
144 144
def ucolwidth(d):
    "Return the number of display columns taken by the Unicode string d"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: one column per character
        return len(d)
    columns = 0
    for ch in d:
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns
151 151
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice s[start:x], trying increasing x, whose
    column width is exactly c. Returns None if no slice has that width.'''
    # Slices shorter than c bytes are never tried. The end index runs up
    # to and including len(s), so that a substring reaching the end of s
    # is found as well.
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
159
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        if not isinstance(s, localstr):
            u = s.decode(encoding, encodingmode)
        else:
            # a localstr carries its exact UTF-8 form
            u = s._utf8.decode("utf-8")

        folded = u.lower()
        if folded != u:
            return folded.encode(encoding)
        return s # already lowercase: preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
General Comments 0
You need to be logged in to leave comments. Login now