##// END OF EJS Templates
encoding: make wide character class list a sysstr...
Augie Fackler -
r32529:0ec17613 default
parent child Browse files
Show More
@@ -1,593 +1,593 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 _sysstr = pycompat.sysstr
21 _sysstr = pycompat.sysstr
22
22
23 if pycompat.ispy3:
23 if pycompat.ispy3:
24 unichr = chr
24 unichr = chr
25
25
26 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
26 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
27 # "Unicode Subtleties"), so we need to ignore them in some places for
27 # "Unicode Subtleties"), so we need to ignore them in some places for
28 # sanity.
28 # sanity.
29 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
29 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
30 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
30 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
31 "206a 206b 206c 206d 206e 206f feff".split()]
31 "206a 206b 206c 206d 206e 206f feff".split()]
32 # verify the next function will work
32 # verify the next function will work
33 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
33 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
34
34
35 def hfsignoreclean(s):
35 def hfsignoreclean(s):
36 """Remove codepoints ignored by HFS+ from s.
36 """Remove codepoints ignored by HFS+ from s.
37
37
38 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
38 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
39 '.hg'
39 '.hg'
40 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
40 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
41 '.hg'
41 '.hg'
42 """
42 """
43 if "\xe2" in s or "\xef" in s:
43 if "\xe2" in s or "\xef" in s:
44 for c in _ignore:
44 for c in _ignore:
45 s = s.replace(c, '')
45 s = s.replace(c, '')
46 return s
46 return s
47
47
48 # encoding.environ is provided read-only, which may not be used to modify
48 # encoding.environ is provided read-only, which may not be used to modify
49 # the process environment
49 # the process environment
50 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
50 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
51 if not pycompat.ispy3:
51 if not pycompat.ispy3:
52 environ = os.environ # re-exports
52 environ = os.environ # re-exports
53 elif _nativeenviron:
53 elif _nativeenviron:
54 environ = os.environb # re-exports
54 environ = os.environb # re-exports
55 else:
55 else:
56 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
56 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
57 # and recreate it once encoding is settled
57 # and recreate it once encoding is settled
58 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
58 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
59 for k, v in os.environ.items()) # re-exports
59 for k, v in os.environ.items()) # re-exports
60
60
61 _encodingfixers = {
61 _encodingfixers = {
62 '646': lambda: 'ascii',
62 '646': lambda: 'ascii',
63 'ANSI_X3.4-1968': lambda: 'ascii',
63 'ANSI_X3.4-1968': lambda: 'ascii',
64 }
64 }
65
65
66 try:
66 try:
67 encoding = environ.get("HGENCODING")
67 encoding = environ.get("HGENCODING")
68 if not encoding:
68 if not encoding:
69 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
69 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
70 encoding = _encodingfixers.get(encoding, lambda: encoding)()
70 encoding = _encodingfixers.get(encoding, lambda: encoding)()
71 except locale.Error:
71 except locale.Error:
72 encoding = 'ascii'
72 encoding = 'ascii'
73 encodingmode = environ.get("HGENCODINGMODE", "strict")
73 encodingmode = environ.get("HGENCODINGMODE", "strict")
74 fallbackencoding = 'ISO-8859-1'
74 fallbackencoding = 'ISO-8859-1'
75
75
76 class localstr(str):
76 class localstr(str):
77 '''This class allows strings that are unmodified to be
77 '''This class allows strings that are unmodified to be
78 round-tripped to the local encoding and back'''
78 round-tripped to the local encoding and back'''
79 def __new__(cls, u, l):
79 def __new__(cls, u, l):
80 s = str.__new__(cls, l)
80 s = str.__new__(cls, l)
81 s._utf8 = u
81 s._utf8 = u
82 return s
82 return s
83 def __hash__(self):
83 def __hash__(self):
84 return hash(self._utf8) # avoid collisions in local string space
84 return hash(self._utf8) # avoid collisions in local string space
85
85
86 def tolocal(s):
86 def tolocal(s):
87 """
87 """
88 Convert a string from internal UTF-8 to local encoding
88 Convert a string from internal UTF-8 to local encoding
89
89
90 All internal strings should be UTF-8 but some repos before the
90 All internal strings should be UTF-8 but some repos before the
91 implementation of locale support may contain latin1 or possibly
91 implementation of locale support may contain latin1 or possibly
92 other character sets. We attempt to decode everything strictly
92 other character sets. We attempt to decode everything strictly
93 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
93 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
94 replace unknown characters.
94 replace unknown characters.
95
95
96 The localstr class is used to cache the known UTF-8 encoding of
96 The localstr class is used to cache the known UTF-8 encoding of
97 strings next to their local representation to allow lossless
97 strings next to their local representation to allow lossless
98 round-trip conversion back to UTF-8.
98 round-trip conversion back to UTF-8.
99
99
100 >>> u = 'foo: \\xc3\\xa4' # utf-8
100 >>> u = 'foo: \\xc3\\xa4' # utf-8
101 >>> l = tolocal(u)
101 >>> l = tolocal(u)
102 >>> l
102 >>> l
103 'foo: ?'
103 'foo: ?'
104 >>> fromlocal(l)
104 >>> fromlocal(l)
105 'foo: \\xc3\\xa4'
105 'foo: \\xc3\\xa4'
106 >>> u2 = 'foo: \\xc3\\xa1'
106 >>> u2 = 'foo: \\xc3\\xa1'
107 >>> d = { l: 1, tolocal(u2): 2 }
107 >>> d = { l: 1, tolocal(u2): 2 }
108 >>> len(d) # no collision
108 >>> len(d) # no collision
109 2
109 2
110 >>> 'foo: ?' in d
110 >>> 'foo: ?' in d
111 False
111 False
112 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
112 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
113 >>> l = tolocal(l1)
113 >>> l = tolocal(l1)
114 >>> l
114 >>> l
115 'foo: ?'
115 'foo: ?'
116 >>> fromlocal(l) # magically in utf-8
116 >>> fromlocal(l) # magically in utf-8
117 'foo: \\xc3\\xa4'
117 'foo: \\xc3\\xa4'
118 """
118 """
119
119
120 try:
120 try:
121 try:
121 try:
122 # make sure string is actually stored in UTF-8
122 # make sure string is actually stored in UTF-8
123 u = s.decode('UTF-8')
123 u = s.decode('UTF-8')
124 if encoding == 'UTF-8':
124 if encoding == 'UTF-8':
125 # fast path
125 # fast path
126 return s
126 return s
127 r = u.encode(_sysstr(encoding), u"replace")
127 r = u.encode(_sysstr(encoding), u"replace")
128 if u == r.decode(_sysstr(encoding)):
128 if u == r.decode(_sysstr(encoding)):
129 # r is a safe, non-lossy encoding of s
129 # r is a safe, non-lossy encoding of s
130 return r
130 return r
131 return localstr(s, r)
131 return localstr(s, r)
132 except UnicodeDecodeError:
132 except UnicodeDecodeError:
133 # we should only get here if we're looking at an ancient changeset
133 # we should only get here if we're looking at an ancient changeset
134 try:
134 try:
135 u = s.decode(_sysstr(fallbackencoding))
135 u = s.decode(_sysstr(fallbackencoding))
136 r = u.encode(_sysstr(encoding), u"replace")
136 r = u.encode(_sysstr(encoding), u"replace")
137 if u == r.decode(_sysstr(encoding)):
137 if u == r.decode(_sysstr(encoding)):
138 # r is a safe, non-lossy encoding of s
138 # r is a safe, non-lossy encoding of s
139 return r
139 return r
140 return localstr(u.encode('UTF-8'), r)
140 return localstr(u.encode('UTF-8'), r)
141 except UnicodeDecodeError:
141 except UnicodeDecodeError:
142 u = s.decode("utf-8", "replace") # last ditch
142 u = s.decode("utf-8", "replace") # last ditch
143 # can't round-trip
143 # can't round-trip
144 return u.encode(_sysstr(encoding), u"replace")
144 return u.encode(_sysstr(encoding), u"replace")
145 except LookupError as k:
145 except LookupError as k:
146 raise error.Abort(k, hint="please check your locale settings")
146 raise error.Abort(k, hint="please check your locale settings")
147
147
148 def fromlocal(s):
148 def fromlocal(s):
149 """
149 """
150 Convert a string from the local character encoding to UTF-8
150 Convert a string from the local character encoding to UTF-8
151
151
152 We attempt to decode strings using the encoding mode set by
152 We attempt to decode strings using the encoding mode set by
153 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
153 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
154 characters will cause an error message. Other modes include
154 characters will cause an error message. Other modes include
155 'replace', which replaces unknown characters with a special
155 'replace', which replaces unknown characters with a special
156 Unicode character, and 'ignore', which drops the character.
156 Unicode character, and 'ignore', which drops the character.
157 """
157 """
158
158
159 # can we do a lossless round-trip?
159 # can we do a lossless round-trip?
160 if isinstance(s, localstr):
160 if isinstance(s, localstr):
161 return s._utf8
161 return s._utf8
162
162
163 try:
163 try:
164 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
164 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
165 return u.encode("utf-8")
165 return u.encode("utf-8")
166 except UnicodeDecodeError as inst:
166 except UnicodeDecodeError as inst:
167 sub = s[max(0, inst.start - 10):inst.start + 10]
167 sub = s[max(0, inst.start - 10):inst.start + 10]
168 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
168 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
169 except LookupError as k:
169 except LookupError as k:
170 raise error.Abort(k, hint="please check your locale settings")
170 raise error.Abort(k, hint="please check your locale settings")
171
171
172 def unitolocal(u):
172 def unitolocal(u):
173 """Convert a unicode string to a byte string of local encoding"""
173 """Convert a unicode string to a byte string of local encoding"""
174 return tolocal(u.encode('utf-8'))
174 return tolocal(u.encode('utf-8'))
175
175
176 def unifromlocal(s):
176 def unifromlocal(s):
177 """Convert a byte string of local encoding to a unicode string"""
177 """Convert a byte string of local encoding to a unicode string"""
178 return fromlocal(s).decode('utf-8')
178 return fromlocal(s).decode('utf-8')
179
179
180 # converter functions between native str and byte string. use these if the
180 # converter functions between native str and byte string. use these if the
181 # character encoding is not known (e.g. exception message) or is known to
181 # character encoding is not known (e.g. exception message) or is known to
182 # be locale dependent (e.g. date formatting.)
182 # be locale dependent (e.g. date formatting.)
183 if pycompat.ispy3:
183 if pycompat.ispy3:
184 strtolocal = unitolocal
184 strtolocal = unitolocal
185 strfromlocal = unifromlocal
185 strfromlocal = unifromlocal
186 else:
186 else:
187 strtolocal = pycompat.identity
187 strtolocal = pycompat.identity
188 strfromlocal = pycompat.identity
188 strfromlocal = pycompat.identity
189
189
190 if not _nativeenviron:
190 if not _nativeenviron:
191 # now encoding and helper functions are available, recreate the environ
191 # now encoding and helper functions are available, recreate the environ
192 # dict to be exported to other modules
192 # dict to be exported to other modules
193 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
193 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
194 for k, v in os.environ.items()) # re-exports
194 for k, v in os.environ.items()) # re-exports
195
195
196 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
196 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
197 wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
197 wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
198 and "WFA" or "WF")
198 and "WFA" or "WF")
199
199
200 def colwidth(s):
200 def colwidth(s):
201 "Find the column width of a string for display in the local encoding"
201 "Find the column width of a string for display in the local encoding"
202 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
202 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
203
203
204 def ucolwidth(d):
204 def ucolwidth(d):
205 "Find the column width of a Unicode string for display"
205 "Find the column width of a Unicode string for display"
206 eaw = getattr(unicodedata, 'east_asian_width', None)
206 eaw = getattr(unicodedata, 'east_asian_width', None)
207 if eaw is not None:
207 if eaw is not None:
208 return sum([eaw(c) in wide and 2 or 1 for c in d])
208 return sum([eaw(c) in wide and 2 or 1 for c in d])
209 return len(d)
209 return len(d)
210
210
211 def getcols(s, start, c):
211 def getcols(s, start, c):
212 '''Use colwidth to find a c-column substring of s starting at byte
212 '''Use colwidth to find a c-column substring of s starting at byte
213 index start'''
213 index start'''
214 for x in xrange(start + c, len(s)):
214 for x in xrange(start + c, len(s)):
215 t = s[start:x]
215 t = s[start:x]
216 if colwidth(t) == c:
216 if colwidth(t) == c:
217 return t
217 return t
218
218
219 def trim(s, width, ellipsis='', leftside=False):
219 def trim(s, width, ellipsis='', leftside=False):
220 """Trim string 's' to at most 'width' columns (including 'ellipsis').
220 """Trim string 's' to at most 'width' columns (including 'ellipsis').
221
221
222 If 'leftside' is True, left side of string 's' is trimmed.
222 If 'leftside' is True, left side of string 's' is trimmed.
223 'ellipsis' is always placed at trimmed side.
223 'ellipsis' is always placed at trimmed side.
224
224
225 >>> ellipsis = '+++'
225 >>> ellipsis = '+++'
226 >>> from . import encoding
226 >>> from . import encoding
227 >>> encoding.encoding = 'utf-8'
227 >>> encoding.encoding = 'utf-8'
228 >>> t= '1234567890'
228 >>> t= '1234567890'
229 >>> print trim(t, 12, ellipsis=ellipsis)
229 >>> print trim(t, 12, ellipsis=ellipsis)
230 1234567890
230 1234567890
231 >>> print trim(t, 10, ellipsis=ellipsis)
231 >>> print trim(t, 10, ellipsis=ellipsis)
232 1234567890
232 1234567890
233 >>> print trim(t, 8, ellipsis=ellipsis)
233 >>> print trim(t, 8, ellipsis=ellipsis)
234 12345+++
234 12345+++
235 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
235 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
236 +++67890
236 +++67890
237 >>> print trim(t, 8)
237 >>> print trim(t, 8)
238 12345678
238 12345678
239 >>> print trim(t, 8, leftside=True)
239 >>> print trim(t, 8, leftside=True)
240 34567890
240 34567890
241 >>> print trim(t, 3, ellipsis=ellipsis)
241 >>> print trim(t, 3, ellipsis=ellipsis)
242 +++
242 +++
243 >>> print trim(t, 1, ellipsis=ellipsis)
243 >>> print trim(t, 1, ellipsis=ellipsis)
244 +
244 +
245 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
245 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
246 >>> t = u.encode(encoding.encoding)
246 >>> t = u.encode(encoding.encoding)
247 >>> print trim(t, 12, ellipsis=ellipsis)
247 >>> print trim(t, 12, ellipsis=ellipsis)
248 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
248 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
249 >>> print trim(t, 10, ellipsis=ellipsis)
249 >>> print trim(t, 10, ellipsis=ellipsis)
250 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
250 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
251 >>> print trim(t, 8, ellipsis=ellipsis)
251 >>> print trim(t, 8, ellipsis=ellipsis)
252 \xe3\x81\x82\xe3\x81\x84+++
252 \xe3\x81\x82\xe3\x81\x84+++
253 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
253 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
254 +++\xe3\x81\x88\xe3\x81\x8a
254 +++\xe3\x81\x88\xe3\x81\x8a
255 >>> print trim(t, 5)
255 >>> print trim(t, 5)
256 \xe3\x81\x82\xe3\x81\x84
256 \xe3\x81\x82\xe3\x81\x84
257 >>> print trim(t, 5, leftside=True)
257 >>> print trim(t, 5, leftside=True)
258 \xe3\x81\x88\xe3\x81\x8a
258 \xe3\x81\x88\xe3\x81\x8a
259 >>> print trim(t, 4, ellipsis=ellipsis)
259 >>> print trim(t, 4, ellipsis=ellipsis)
260 +++
260 +++
261 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
261 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
262 +++
262 +++
263 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
263 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
264 >>> print trim(t, 12, ellipsis=ellipsis)
264 >>> print trim(t, 12, ellipsis=ellipsis)
265 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
265 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
266 >>> print trim(t, 10, ellipsis=ellipsis)
266 >>> print trim(t, 10, ellipsis=ellipsis)
267 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
267 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
268 >>> print trim(t, 8, ellipsis=ellipsis)
268 >>> print trim(t, 8, ellipsis=ellipsis)
269 \x11\x22\x33\x44\x55+++
269 \x11\x22\x33\x44\x55+++
270 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
270 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
271 +++\x66\x77\x88\x99\xaa
271 +++\x66\x77\x88\x99\xaa
272 >>> print trim(t, 8)
272 >>> print trim(t, 8)
273 \x11\x22\x33\x44\x55\x66\x77\x88
273 \x11\x22\x33\x44\x55\x66\x77\x88
274 >>> print trim(t, 8, leftside=True)
274 >>> print trim(t, 8, leftside=True)
275 \x33\x44\x55\x66\x77\x88\x99\xaa
275 \x33\x44\x55\x66\x77\x88\x99\xaa
276 >>> print trim(t, 3, ellipsis=ellipsis)
276 >>> print trim(t, 3, ellipsis=ellipsis)
277 +++
277 +++
278 >>> print trim(t, 1, ellipsis=ellipsis)
278 >>> print trim(t, 1, ellipsis=ellipsis)
279 +
279 +
280 """
280 """
281 try:
281 try:
282 u = s.decode(_sysstr(encoding))
282 u = s.decode(_sysstr(encoding))
283 except UnicodeDecodeError:
283 except UnicodeDecodeError:
284 if len(s) <= width: # trimming is not needed
284 if len(s) <= width: # trimming is not needed
285 return s
285 return s
286 width -= len(ellipsis)
286 width -= len(ellipsis)
287 if width <= 0: # not enough room even for ellipsis
287 if width <= 0: # not enough room even for ellipsis
288 return ellipsis[:width + len(ellipsis)]
288 return ellipsis[:width + len(ellipsis)]
289 if leftside:
289 if leftside:
290 return ellipsis + s[-width:]
290 return ellipsis + s[-width:]
291 return s[:width] + ellipsis
291 return s[:width] + ellipsis
292
292
293 if ucolwidth(u) <= width: # trimming is not needed
293 if ucolwidth(u) <= width: # trimming is not needed
294 return s
294 return s
295
295
296 width -= len(ellipsis)
296 width -= len(ellipsis)
297 if width <= 0: # not enough room even for ellipsis
297 if width <= 0: # not enough room even for ellipsis
298 return ellipsis[:width + len(ellipsis)]
298 return ellipsis[:width + len(ellipsis)]
299
299
300 if leftside:
300 if leftside:
301 uslice = lambda i: u[i:]
301 uslice = lambda i: u[i:]
302 concat = lambda s: ellipsis + s
302 concat = lambda s: ellipsis + s
303 else:
303 else:
304 uslice = lambda i: u[:-i]
304 uslice = lambda i: u[:-i]
305 concat = lambda s: s + ellipsis
305 concat = lambda s: s + ellipsis
306 for i in xrange(1, len(u)):
306 for i in xrange(1, len(u)):
307 usub = uslice(i)
307 usub = uslice(i)
308 if ucolwidth(usub) <= width:
308 if ucolwidth(usub) <= width:
309 return concat(usub.encode(_sysstr(encoding)))
309 return concat(usub.encode(_sysstr(encoding)))
310 return ellipsis # not enough room for multi-column characters
310 return ellipsis # not enough room for multi-column characters
311
311
312 def _asciilower(s):
312 def _asciilower(s):
313 '''convert a string to lowercase if ASCII
313 '''convert a string to lowercase if ASCII
314
314
315 Raises UnicodeDecodeError if non-ASCII characters are found.'''
315 Raises UnicodeDecodeError if non-ASCII characters are found.'''
316 s.decode('ascii')
316 s.decode('ascii')
317 return s.lower()
317 return s.lower()
318
318
319 def asciilower(s):
319 def asciilower(s):
320 # delay importing avoids cyclic dependency around "parsers" in
320 # delay importing avoids cyclic dependency around "parsers" in
321 # pure Python build (util => i18n => encoding => parsers => util)
321 # pure Python build (util => i18n => encoding => parsers => util)
322 parsers = policy.importmod(r'parsers')
322 parsers = policy.importmod(r'parsers')
323 impl = getattr(parsers, 'asciilower', _asciilower)
323 impl = getattr(parsers, 'asciilower', _asciilower)
324 global asciilower
324 global asciilower
325 asciilower = impl
325 asciilower = impl
326 return impl(s)
326 return impl(s)
327
327
328 def _asciiupper(s):
328 def _asciiupper(s):
329 '''convert a string to uppercase if ASCII
329 '''convert a string to uppercase if ASCII
330
330
331 Raises UnicodeDecodeError if non-ASCII characters are found.'''
331 Raises UnicodeDecodeError if non-ASCII characters are found.'''
332 s.decode('ascii')
332 s.decode('ascii')
333 return s.upper()
333 return s.upper()
334
334
335 def asciiupper(s):
335 def asciiupper(s):
336 # delay importing avoids cyclic dependency around "parsers" in
336 # delay importing avoids cyclic dependency around "parsers" in
337 # pure Python build (util => i18n => encoding => parsers => util)
337 # pure Python build (util => i18n => encoding => parsers => util)
338 parsers = policy.importmod(r'parsers')
338 parsers = policy.importmod(r'parsers')
339 impl = getattr(parsers, 'asciiupper', _asciiupper)
339 impl = getattr(parsers, 'asciiupper', _asciiupper)
340 global asciiupper
340 global asciiupper
341 asciiupper = impl
341 asciiupper = impl
342 return impl(s)
342 return impl(s)
343
343
344 def lower(s):
344 def lower(s):
345 "best-effort encoding-aware case-folding of local string s"
345 "best-effort encoding-aware case-folding of local string s"
346 try:
346 try:
347 return asciilower(s)
347 return asciilower(s)
348 except UnicodeDecodeError:
348 except UnicodeDecodeError:
349 pass
349 pass
350 try:
350 try:
351 if isinstance(s, localstr):
351 if isinstance(s, localstr):
352 u = s._utf8.decode("utf-8")
352 u = s._utf8.decode("utf-8")
353 else:
353 else:
354 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
354 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
355
355
356 lu = u.lower()
356 lu = u.lower()
357 if u == lu:
357 if u == lu:
358 return s # preserve localstring
358 return s # preserve localstring
359 return lu.encode(_sysstr(encoding))
359 return lu.encode(_sysstr(encoding))
360 except UnicodeError:
360 except UnicodeError:
361 return s.lower() # we don't know how to fold this except in ASCII
361 return s.lower() # we don't know how to fold this except in ASCII
362 except LookupError as k:
362 except LookupError as k:
363 raise error.Abort(k, hint="please check your locale settings")
363 raise error.Abort(k, hint="please check your locale settings")
364
364
365 def upper(s):
365 def upper(s):
366 "best-effort encoding-aware case-folding of local string s"
366 "best-effort encoding-aware case-folding of local string s"
367 try:
367 try:
368 return asciiupper(s)
368 return asciiupper(s)
369 except UnicodeDecodeError:
369 except UnicodeDecodeError:
370 return upperfallback(s)
370 return upperfallback(s)
371
371
372 def upperfallback(s):
372 def upperfallback(s):
373 try:
373 try:
374 if isinstance(s, localstr):
374 if isinstance(s, localstr):
375 u = s._utf8.decode("utf-8")
375 u = s._utf8.decode("utf-8")
376 else:
376 else:
377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
378
378
379 uu = u.upper()
379 uu = u.upper()
380 if u == uu:
380 if u == uu:
381 return s # preserve localstring
381 return s # preserve localstring
382 return uu.encode(_sysstr(encoding))
382 return uu.encode(_sysstr(encoding))
383 except UnicodeError:
383 except UnicodeError:
384 return s.upper() # we don't know how to fold this except in ASCII
384 return s.upper() # we don't know how to fold this except in ASCII
385 except LookupError as k:
385 except LookupError as k:
386 raise error.Abort(k, hint="please check your locale settings")
386 raise error.Abort(k, hint="please check your locale settings")
387
387
388 class normcasespecs(object):
388 class normcasespecs(object):
389 '''what a platform's normcase does to ASCII strings
389 '''what a platform's normcase does to ASCII strings
390
390
391 This is specified per platform, and should be consistent with what normcase
391 This is specified per platform, and should be consistent with what normcase
392 on that platform actually does.
392 on that platform actually does.
393
393
394 lower: normcase lowercases ASCII strings
394 lower: normcase lowercases ASCII strings
395 upper: normcase uppercases ASCII strings
395 upper: normcase uppercases ASCII strings
396 other: the fallback function should always be called
396 other: the fallback function should always be called
397
397
398 This should be kept in sync with normcase_spec in util.h.'''
398 This should be kept in sync with normcase_spec in util.h.'''
399 lower = -1
399 lower = -1
400 upper = 1
400 upper = 1
401 other = 0
401 other = 0
402
402
403 _jsonmap = []
403 _jsonmap = []
404 _jsonmap.extend("\\u%04x" % x for x in range(32))
404 _jsonmap.extend("\\u%04x" % x for x in range(32))
405 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
405 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
406 _jsonmap.append('\\u007f')
406 _jsonmap.append('\\u007f')
407 _jsonmap[0x09] = '\\t'
407 _jsonmap[0x09] = '\\t'
408 _jsonmap[0x0a] = '\\n'
408 _jsonmap[0x0a] = '\\n'
409 _jsonmap[0x22] = '\\"'
409 _jsonmap[0x22] = '\\"'
410 _jsonmap[0x5c] = '\\\\'
410 _jsonmap[0x5c] = '\\\\'
411 _jsonmap[0x08] = '\\b'
411 _jsonmap[0x08] = '\\b'
412 _jsonmap[0x0c] = '\\f'
412 _jsonmap[0x0c] = '\\f'
413 _jsonmap[0x0d] = '\\r'
413 _jsonmap[0x0d] = '\\r'
414 _paranoidjsonmap = _jsonmap[:]
414 _paranoidjsonmap = _jsonmap[:]
415 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
415 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
416 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
416 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
417 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
417 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
418
418
419 def jsonescape(s, paranoid=False):
419 def jsonescape(s, paranoid=False):
420 '''returns a string suitable for JSON
420 '''returns a string suitable for JSON
421
421
422 JSON is problematic for us because it doesn't support non-Unicode
422 JSON is problematic for us because it doesn't support non-Unicode
423 bytes. To deal with this, we take the following approach:
423 bytes. To deal with this, we take the following approach:
424
424
425 - localstr objects are converted back to UTF-8
425 - localstr objects are converted back to UTF-8
426 - valid UTF-8/ASCII strings are passed as-is
426 - valid UTF-8/ASCII strings are passed as-is
427 - other strings are converted to UTF-8b surrogate encoding
427 - other strings are converted to UTF-8b surrogate encoding
428 - apply JSON-specified string escaping
428 - apply JSON-specified string escaping
429
429
430 (escapes are doubled in these tests)
430 (escapes are doubled in these tests)
431
431
432 >>> jsonescape('this is a test')
432 >>> jsonescape('this is a test')
433 'this is a test'
433 'this is a test'
434 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
434 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
435 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
435 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
436 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
436 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
437 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
437 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
438 >>> jsonescape('a weird byte: \\xdd')
438 >>> jsonescape('a weird byte: \\xdd')
439 'a weird byte: \\xed\\xb3\\x9d'
439 'a weird byte: \\xed\\xb3\\x9d'
440 >>> jsonescape('utf-8: caf\\xc3\\xa9')
440 >>> jsonescape('utf-8: caf\\xc3\\xa9')
441 'utf-8: caf\\xc3\\xa9'
441 'utf-8: caf\\xc3\\xa9'
442 >>> jsonescape('')
442 >>> jsonescape('')
443 ''
443 ''
444
444
445 If paranoid, non-ascii and common troublesome characters are also escaped.
445 If paranoid, non-ascii and common troublesome characters are also escaped.
446 This is suitable for web output.
446 This is suitable for web output.
447
447
448 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
448 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
449 'escape boundary: ~ \\\\u007f \\\\u0080'
449 'escape boundary: ~ \\\\u007f \\\\u0080'
450 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
450 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
451 'a weird byte: \\\\udcdd'
451 'a weird byte: \\\\udcdd'
452 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
452 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
453 'utf-8: caf\\\\u00e9'
453 'utf-8: caf\\\\u00e9'
454 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
454 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
455 'non-BMP: \\\\ud834\\\\udd1e'
455 'non-BMP: \\\\ud834\\\\udd1e'
456 >>> jsonescape('<foo@example.org>', paranoid=True)
456 >>> jsonescape('<foo@example.org>', paranoid=True)
457 '\\\\u003cfoo@example.org\\\\u003e'
457 '\\\\u003cfoo@example.org\\\\u003e'
458 '''
458 '''
459
459
460 if paranoid:
460 if paranoid:
461 jm = _paranoidjsonmap
461 jm = _paranoidjsonmap
462 else:
462 else:
463 jm = _jsonmap
463 jm = _jsonmap
464
464
465 u8chars = toutf8b(s)
465 u8chars = toutf8b(s)
466 try:
466 try:
467 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
467 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
468 except IndexError:
468 except IndexError:
469 pass
469 pass
470 # non-BMP char is represented as UTF-16 surrogate pair
470 # non-BMP char is represented as UTF-16 surrogate pair
471 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
471 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
472 u16codes.pop(0) # drop BOM
472 u16codes.pop(0) # drop BOM
473 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
473 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
474
474
# Number of bytes in a UTF-8 sequence, indexed by the high nibble of its
# first byte. 0 marks ASCII (nibbles 0x0-0x7). Continuation bytes
# (nibbles 0x8-0xb) map to 1 so that a stray one is sliced out on its own
# and then rejected by the decode check in getutf8char.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    s is a byte string and pos an index into it (0 <= pos < len(s)). The
    return value is the slice of s holding one character: a single byte
    for ASCII, otherwise the 2-4 byte sequence announced by the lead byte.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character, and an IndexError if pos is past the end of s.
    '''

    # Slice instead of indexing: indexing a Python 3 bytes object yields an
    # int, which ord() rejects, while a one-byte slice behaves the same on
    # both major versions.
    first = s[pos:pos + 1]
    if not first:
        # keep the exception plain indexing used to raise
        raise IndexError("string index out of range")

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(first) >> 4]
    if not l: # ascii
        return first

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
493
493
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # Fast path. It is only safe when no 0xed byte is present: 0xed is the
    # lead byte of U+DCxx sequences, which must be re-escaped by the loop
    # below rather than passed through.
    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # Gather the output pieces and join them once at the end rather than
    # growing a string with += on every iteration.
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters; only the
                # first byte is consumed, the rest is handled by later
                # iterations. A one-byte slice (not an index) is passed to
                # ord() so this also works where indexing yields an int.
                c = unichr(0xdc00 + ord(s[pos:pos + 1])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not valid UTF-8 at this position: escape this single byte
            # as U+DCxx and resume right after it
            c = unichr(0xdc00 + ord(s[pos:pos + 1])).encode('utf-8')
            pos += 1
        parts.append(c)
    return "".join(parts)
550
550
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes the escaping done by toutf8b: every U+DCxx character is
    turned back into the single byte 0xxx, and everything else is copied
    through unchanged. It is a round-trip process for strings like
    filenames, but metadata that was passed through tolocal will remain
    in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # Gather the output pieces and join them once at the end rather than
    # growing a string with += on every iteration.
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters: the low 8 bits of the code point
        # are the original byte value
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        parts.append(c)
    return "".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now