##// END OF EJS Templates
encoding: add 'trim' to trim multi-byte characters at most specified columns...
FUJIWARA Katsunori -
r21856:d24969ee default
parent child Browse files
Show More
@@ -1,287 +1,357 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
def _getpreferredencoding():
    '''
    Return the codeset of the user's locale as reported by nl_langinfo.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.

    LC_CTYPE is temporarily switched to the user's default locale for the
    query and is restored afterwards, even when the query itself fails.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    # if this raises, the locale has not been changed, so nothing to undo
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process in the temporarily selected locale
        locale.setlocale(locale.LC_CTYPE, oldloc)
34
34
# Encoding names reported by the locale that get replaced: each maps to a
# callable returning the encoding name to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# Local encoding: HGENCODING wins when set and non-empty; otherwise ask the
# locale (passing the answer through _encodingfixers) and use ASCII when
# the locale machinery fails.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error handling mode handed to decode() when converting local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding tried for stored strings that are not valid UTF-8
fallbackencoding = 'ISO-8859-1'
50
50
class localstr(str):
    '''A local-encoding string that remembers its UTF-8 original

    The instance compares and prints as the local representation 'l',
    while the UTF-8 form 'u' is kept in '_utf8' so that an unmodified
    string can be converted back without loss.'''
    def __new__(cls, u, l):
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
60
60
61 def tolocal(s):
61 def tolocal(s):
62 """
62 """
63 Convert a string from internal UTF-8 to local encoding
63 Convert a string from internal UTF-8 to local encoding
64
64
65 All internal strings should be UTF-8 but some repos before the
65 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
66 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
67 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
69 replace unknown characters.
70
70
71 The localstr class is used to cache the known UTF-8 encoding of
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
73 round-trip conversion back to UTF-8.
74
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
76 >>> l = tolocal(u)
77 >>> l
77 >>> l
78 'foo: ?'
78 'foo: ?'
79 >>> fromlocal(l)
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> len(d) # no collision
83 >>> len(d) # no collision
84 2
84 2
85 >>> 'foo: ?' in d
85 >>> 'foo: ?' in d
86 False
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
88 >>> l = tolocal(l1)
89 >>> l
89 >>> l
90 'foo: ?'
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
92 'foo: \\xc3\\xa4'
93 """
93 """
94
94
95 try:
95 try:
96 try:
96 try:
97 # make sure string is actually stored in UTF-8
97 # make sure string is actually stored in UTF-8
98 u = s.decode('UTF-8')
98 u = s.decode('UTF-8')
99 if encoding == 'UTF-8':
99 if encoding == 'UTF-8':
100 # fast path
100 # fast path
101 return s
101 return s
102 r = u.encode(encoding, "replace")
102 r = u.encode(encoding, "replace")
103 if u == r.decode(encoding):
103 if u == r.decode(encoding):
104 # r is a safe, non-lossy encoding of s
104 # r is a safe, non-lossy encoding of s
105 return r
105 return r
106 return localstr(s, r)
106 return localstr(s, r)
107 except UnicodeDecodeError:
107 except UnicodeDecodeError:
108 # we should only get here if we're looking at an ancient changeset
108 # we should only get here if we're looking at an ancient changeset
109 try:
109 try:
110 u = s.decode(fallbackencoding)
110 u = s.decode(fallbackencoding)
111 r = u.encode(encoding, "replace")
111 r = u.encode(encoding, "replace")
112 if u == r.decode(encoding):
112 if u == r.decode(encoding):
113 # r is a safe, non-lossy encoding of s
113 # r is a safe, non-lossy encoding of s
114 return r
114 return r
115 return localstr(u.encode('UTF-8'), r)
115 return localstr(u.encode('UTF-8'), r)
116 except UnicodeDecodeError:
116 except UnicodeDecodeError:
117 u = s.decode("utf-8", "replace") # last ditch
117 u = s.decode("utf-8", "replace") # last ditch
118 return u.encode(encoding, "replace") # can't round-trip
118 return u.encode(encoding, "replace") # can't round-trip
119 except LookupError, k:
119 except LookupError, k:
120 raise error.Abort(k, hint="please check your locale settings")
120 raise error.Abort(k, hint="please check your locale settings")
121
121
122 def fromlocal(s):
122 def fromlocal(s):
123 """
123 """
124 Convert a string from the local character encoding to UTF-8
124 Convert a string from the local character encoding to UTF-8
125
125
126 We attempt to decode strings using the encoding mode set by
126 We attempt to decode strings using the encoding mode set by
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 characters will cause an error message. Other modes include
128 characters will cause an error message. Other modes include
129 'replace', which replaces unknown characters with a special
129 'replace', which replaces unknown characters with a special
130 Unicode character, and 'ignore', which drops the character.
130 Unicode character, and 'ignore', which drops the character.
131 """
131 """
132
132
133 # can we do a lossless round-trip?
133 # can we do a lossless round-trip?
134 if isinstance(s, localstr):
134 if isinstance(s, localstr):
135 return s._utf8
135 return s._utf8
136
136
137 try:
137 try:
138 return s.decode(encoding, encodingmode).encode("utf-8")
138 return s.decode(encoding, encodingmode).encode("utf-8")
139 except UnicodeDecodeError, inst:
139 except UnicodeDecodeError, inst:
140 sub = s[max(0, inst.start - 10):inst.start + 10]
140 sub = s[max(0, inst.start - 10):inst.start + 10]
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 except LookupError, k:
142 except LookupError, k:
143 raise error.Abort(k, hint="please check your locale settings")
143 raise error.Abort(k, hint="please check your locale settings")
144
144
# East Asian Width classes counted as two columns. Ambiguous-width ("A")
# characters are included only when HGENCODINGAMBIGUOUS is set to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148
148
def colwidth(s):
    "Return the number of display columns local-encoding string s occupies"
    # undecodable bytes are replaced rather than reported
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
152
152
def ucolwidth(d):
    "Return the number of display columns Unicode string d occupies"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    total = 0
    for c in d:
        # characters whose width class is listed in 'wide' take two columns
        if eaw(c) in wide:
            total += 2
        else:
            total += 1
    return total
159
159
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate substrings s[start:x] are tried with growing x, beginning
    at the shortest slice that could span c columns, up to and including
    the one that reaches the end of s. The first one whose column width
    is exactly c is returned; None is returned when there is none.'''
    # the upper bound is len(s) + 1 so that the slice ending at the very
    # end of s is tried as well
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None
167
167
def trim(s, width, ellipsis=''):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    's' is a string in the local encoding. If it fits in 'width' columns
    it is returned unchanged. Otherwise the longest leading part that
    leaves room for 'ellipsis' is returned with 'ellipsis' appended; when
    there is no room for any character, 'ellipsis' alone (cut down to
    'width' bytes if needed) is returned. The room taken by 'ellipsis' is
    measured with len().

    If 's' cannot be decoded with the local encoding, every byte is
    counted as one column.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # not valid in the local encoding: treat each byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            # max() keeps a negative 'width' from turning into a
            # "drop the last N bytes" slice
            return ellipsis[:max(0, width + len(ellipsis))]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:max(0, width + len(ellipsis))]

    # Column widths add up character by character, so the longest prefix
    # that fits can be found in a single pass. The whole of 'u' is known
    # not to fit at this point, so the loop always stops early.
    cols = 0
    keep = 0
    for c in u:
        cols += ucolwidth(c)
        if cols > width:
            break
        keep += 1
    if keep == 0:
        return ellipsis # no enough room for multi-column characters
    return u[:keep].encode(encoding) + ellipsis
237
168 def lower(s):
238 def lower(s):
169 "best-effort encoding-aware case-folding of local string s"
239 "best-effort encoding-aware case-folding of local string s"
170 try:
240 try:
171 s.decode('ascii') # throw exception for non-ASCII character
241 s.decode('ascii') # throw exception for non-ASCII character
172 return s.lower()
242 return s.lower()
173 except UnicodeDecodeError:
243 except UnicodeDecodeError:
174 pass
244 pass
175 try:
245 try:
176 if isinstance(s, localstr):
246 if isinstance(s, localstr):
177 u = s._utf8.decode("utf-8")
247 u = s._utf8.decode("utf-8")
178 else:
248 else:
179 u = s.decode(encoding, encodingmode)
249 u = s.decode(encoding, encodingmode)
180
250
181 lu = u.lower()
251 lu = u.lower()
182 if u == lu:
252 if u == lu:
183 return s # preserve localstring
253 return s # preserve localstring
184 return lu.encode(encoding)
254 return lu.encode(encoding)
185 except UnicodeError:
255 except UnicodeError:
186 return s.lower() # we don't know how to fold this except in ASCII
256 return s.lower() # we don't know how to fold this except in ASCII
187 except LookupError, k:
257 except LookupError, k:
188 raise error.Abort(k, hint="please check your locale settings")
258 raise error.Abort(k, hint="please check your locale settings")
189
259
190 def upper(s):
260 def upper(s):
191 "best-effort encoding-aware case-folding of local string s"
261 "best-effort encoding-aware case-folding of local string s"
192 try:
262 try:
193 s.decode('ascii') # throw exception for non-ASCII character
263 s.decode('ascii') # throw exception for non-ASCII character
194 return s.upper()
264 return s.upper()
195 except UnicodeDecodeError:
265 except UnicodeDecodeError:
196 pass
266 pass
197 try:
267 try:
198 if isinstance(s, localstr):
268 if isinstance(s, localstr):
199 u = s._utf8.decode("utf-8")
269 u = s._utf8.decode("utf-8")
200 else:
270 else:
201 u = s.decode(encoding, encodingmode)
271 u = s.decode(encoding, encodingmode)
202
272
203 uu = u.upper()
273 uu = u.upper()
204 if u == uu:
274 if u == uu:
205 return s # preserve localstring
275 return s # preserve localstring
206 return uu.encode(encoding)
276 return uu.encode(encoding)
207 except UnicodeError:
277 except UnicodeError:
208 return s.upper() # we don't know how to fold this except in ASCII
278 return s.upper() # we don't know how to fold this except in ASCII
209 except LookupError, k:
279 except LookupError, k:
210 raise error.Abort(k, hint="please check your locale settings")
280 raise error.Abort(k, hint="please check your locale settings")
211
281
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    The empty string is valid UTF-8 and is returned as is.
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        # Only the success of the decode matters, not its result: testing
        # the decoded value for truth would reject the empty string.
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # surrogate-encode any characters that don't round-trip
    # NOTE(review): bytes are matched against the "ignore"-decoded copy by
    # position only, so an invalid byte equal to the next valid byte
    # (e.g. '\xc3\xc3\xa9') looks like it could be paired up wrongly --
    # behaviour kept as is, confirm before relying on such input.
    s2 = s.decode('utf-8', 'ignore').encode('utf-8')
    parts = []
    pos = 0
    for c in s:
        if s2[pos:pos + 1] == c:
            parts.append(c)
            pos += 1
        else:
            parts.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
    return "".join(parts)
260
330
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Return the original binary string. This is a round-trip process for
    strings like filenames, but metadata that was passed through tolocal
    will remain in UTF-8.

    Every UTF-8 encoded code point in the range U+DC00-U+DCFF (the form
    toutf8b gives to bytes that were not valid UTF-8) is replaced by the
    single byte held in its low eight bits. All other bytes are returned
    untouched. The string is handled as bytes and never decoded, so the
    result does not depend on how the interpreter stores characters
    outside the Basic Multilingual Plane.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # only needed on the slow path
    import re

    def unescape(m):
        # U+DCxx is encoded as ED Bx yy: the two low bits of the second
        # byte and the six low bits of the third form the original byte
        high = ord(m.group(1)) & 0x03
        low = ord(m.group(2)) & 0x3f
        return chr((high << 6) | low)

    # In valid UTF-8, ED only ever starts a three-byte sequence, and a
    # second byte of B0-B3 selects exactly U+DC00-U+DCFF.
    return re.sub("\xed([\xb0-\xb3])([\x80-\xbf])", unescape, s)
General Comments 0
You need to be logged in to leave comments. Login now