##// END OF EJS Templates
encoding: use i.startswith() instead of i[0] to eliminate py2/3 divergence
Yuya Nishihara -
r32299:7040f513 default
parent child Browse files
Show More
@@ -1,595 +1,592
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 pycompat,
17 pycompat,
18 )
18 )
19
19
# shorthand for converting bytes to the native str type of the interpreter
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # py3 has no unichr(); chr() already produces a unicode code point
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
# (every ignored codepoint's UTF-8 encoding begins with one of these two
# lead bytes; startswith() with a tuple works identically for py2 str and
# py3 bytes, avoiding the i[0] str-vs-int divergence)
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
33
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: every entry in _ignore starts with one of these two
    # lead bytes, so strings without them can skip the replace loop entirely
    if "\xe2" in s or "\xef" in s:
        for ignored in _ignore:
            s = s.replace(ignored, '')
    return s
49
46
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items()) # re-exports

# remap locale names that the codecs module cannot resolve to 'ascii'
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    # HGENCODING overrides the locale-derived encoding when set
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# how decode errors are handled: 'strict' (abort), 'replace' or 'ignore'
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried for pre-locale-support repository data (see tolocal)
fallbackencoding = 'ISO-8859-1'
77
74
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # u is the UTF-8 form, l the local-encoding form; the instance's
        # own bytes are the local form, with the UTF-8 original cached on
        # the side for lossless recovery by fromlocal()
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
87
84
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: keep the UTF-8 original cached alongside
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # unknown codec name in `encoding` or `fallbackencoding`
        raise error.Abort(k, hint="please check your locale settings")
149
146
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a ~20-byte window around the offending byte for context
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
173
170
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
177
174
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
181
178
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
else:
    # on py2 native str is already bytes; nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items()) # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The resulting string is the set of east_asian_width() categories counted
# as two columns by ucolwidth().
wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
201
198
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
205
202
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; assume one column per char
        return len(d)
    return sum([2 if eaw(ch) in wide else 1 for ch in d])
212
209
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # NOTE(review): returns None implicitly when no prefix measures exactly
    # c columns — callers appear expected to pass well-formed input
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
220
217
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming byte-wise, treating one
        # byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # no enough room for multi-column characters
313
310
314 def _asciilower(s):
311 def _asciilower(s):
315 '''convert a string to lowercase if ASCII
312 '''convert a string to lowercase if ASCII
316
313
317 Raises UnicodeDecodeError if non-ASCII characters are found.'''
314 Raises UnicodeDecodeError if non-ASCII characters are found.'''
318 s.decode('ascii')
315 s.decode('ascii')
319 return s.lower()
316 return s.lower()
320
317
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    # rebind the module-level name to the resolved implementation so the
    # import and getattr happen only on the first call
    asciilower = impl
    return impl(s)
329
326
330 def _asciiupper(s):
327 def _asciiupper(s):
331 '''convert a string to uppercase if ASCII
328 '''convert a string to uppercase if ASCII
332
329
333 Raises UnicodeDecodeError if non-ASCII characters are found.'''
330 Raises UnicodeDecodeError if non-ASCII characters are found.'''
334 s.decode('ascii')
331 s.decode('ascii')
335 return s.upper()
332 return s.upper()
336
333
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    # rebind the module-level name to the resolved implementation so the
    # import and getattr happen only on the first call
    asciiupper = impl
    return impl(s)
345
342
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings need no encoding-aware handling
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a lossless decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
366
363
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings need no encoding-aware handling
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
373
370
def upperfallback(s):
    # encoding-aware uppercasing; mirrors the slow path of lower()
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a lossless decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
389
386
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # the exact integer values matter: they mirror normcase_spec in util.h
    lower = -1
    upper = 1
    other = 0
404
401
# byte-value -> JSON-escaped-string lookup table, indexed by ord(byte):
# control characters become \uXXXX escapes, printable ASCII passes through
_jsonmap = []
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# short-form escapes where JSON defines them
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid table is snapshotted BEFORE the high-byte entries are added
# below, so in paranoid mode bytes >= 0x80 raise IndexError in jsonescape's
# fast path and take the escape-everything slow path instead
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
420
417
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # IndexError is raised for bytes with no table entry, i.e. bytes
        # >= 0x80 when jm is the 128-entry paranoid table
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
476
473
# bytes to attempt decoding, indexed by the high nibble of a UTF-8 lead
# byte: 0 marks single-byte (ASCII), 0xc/0xd start 2-byte sequences,
# 0xe 3-byte, 0xf 4-byte; the 1 entries (bare continuation bytes) make
# getutf8char's validation decode fail as intended
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
478
475
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    # NOTE(review): ord(s[pos]) assumes indexing yields a 1-char string
    # (py2 str semantics)
    l = _utf8len[ord(s[pos]) >> 4]
    if not l: # ascii
        return s[pos]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
495
492
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for arbitrary
    byte strings.  As Mercurial often doesn't know what encoding data is
    in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped into the UTF-16 surrogate
    range uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original.  Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: without an "\xed" byte there can be no pre-existing
    # surrogate escapes to rewrite, so clean UTF-8 may pass through
    if "\xed" not in s:
        if isinstance(s, localstr):
            # a localstr carries its known-good UTF-8 form in a cache
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
        except UnicodeDecodeError:
            # invalid byte: smuggle it out as a U+DCxx surrogate
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # an existing U+DCxx character must itself be re-escaped
                # byte by byte, starting with its lead byte
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        chunks.append(c)
    return ''.join(chunks)
552
549
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes the U+DCxx surrogate escaping performed by toutf8b and
    returns the original binary string.  It is a round-trip process for
    strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.
    '''

    # fast path: no "\xed" byte means no U+DCxx escapes to undo
    if "\xed" not in s:
        return s

    # We could do this with the unicode type, but some Python builds
    # use UTF-16 internally (issue5031), which causes non-BMP code
    # points to be escaped.  Instead, walk the string one UTF-8
    # character at a time with getutf8char, never "decoding" the whole.
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # U+DCxx escape: recover the original low byte
            c = chr(ord(c.decode("utf-8")) & 0xff)
        out.append(c)
    return ''.join(out)
General Comments 0
You need to be logged in to leave comments. Login now