##// END OF EJS Templates
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara -
r33810:dabe1f11 default
parent child Browse files
Show More
@@ -1,575 +1,575 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
# load the C or pure-Python charencode implementation per the build policy
charencode = policy.importmod(r'charencode')

# fast ASCII-only case-folding helpers; raise UnicodeDecodeError on
# non-ASCII input so callers can fall back to slower unicode-aware paths
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # py3 has no unichr(); alias it so the table below works on both
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
39
39
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every codepoint in _ignore encodes to utf-8 starting with 0xe2 or
    # 0xef (asserted above), so a cheap containment probe lets us skip
    # the replace loop for the overwhelmingly common clean case
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
52
52
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    # py2 environ is already byte strings
    environ = os.environ  # re-exports
elif _nativeenviron:
    # py3 with bytes environ support (POSIX)
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())  # re-exports

# normalize platform-specific spellings of ASCII returned by
# locale.getpreferredencoding() to the canonical codec name
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    # HGENCODING overrides the locale-derived encoding
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error handler used when decoding local strings: strict/replace/ignore
encodingmode = environ.get("HGENCODINGMODE", "strict")
# historical repos may contain latin1 data; see tolocal()
fallbackencoding = 'ISO-8859-1'
80
80
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        # the byte content of the instance is the local-encoding form `l`
        obj = bytes.__new__(cls, l)
        # stash the known-good UTF-8 spelling for lossless recovery
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
90
90
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: attach the UTF-8 original via localstr so
            # fromlocal() can round-trip without data loss
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the re-encoded UTF-8 form alongside the local form
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
152
152
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        # localstr caches the UTF-8 form computed by tolocal()
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte in the message
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
176
176
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    # round through UTF-8, the internal interchange encoding
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
180
180
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    # go to UTF-8 first, then decode to a native unicode string
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
184
184
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""
    def unifunc(obj):
        # convert the byte-string result into a native unicode string
        return unifromlocal(bytesfunc(obj))
    return unifunc
191
191
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # on py3, native str is unicode, so reuse the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on py2, native str is already a byte string: no conversion needed
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide holds the unicodedata.east_asian_width() categories counted as two
# columns: W/F always, plus A when HGENCODINGAMBIGUOUS=wide.
_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                and "WFA" or "WF")
213
213
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently: width estimation does not need strict decoding
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
217
217
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; assume one column per char
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
224
224
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte slice until it renders at exactly c columns;
    # multi-byte characters mean byte length and column width differ
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    # falls through to an implicit None when no slice reaches c columns
232
232
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # trim one character at a time from the chosen side until the
    # remaining text fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # no enough room for multi-column characters
325
325
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII input is lowercased by the C helper
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a lossless decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
346
346
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII input is uppercased by the C helper
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present; take the encoding-aware slow path
        return upperfallback(s)
353
353
def upperfallback(s):
    '''Encoding-aware uppercasing used when asciiupper() rejects s.'''
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form for a lossless decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
369
369
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # enum-style class constants; values mirror normcase_spec in util.h
    other = 0
    upper = 1
    lower = -1
384
384
# byte-value -> JSON-escaped-string lookup table, indexed by ord(byte)
_jsonmap = []
# control characters get \uXXXX escapes; printable ascii passes through
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# prefer the short two-character escapes for common control characters
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# snapshot the 128-entry ascii table for the paranoid variant BEFORE the
# non-ascii range is appended below: jsonescape() relies on the paranoid
# table raising IndexError for bytes >= 0x80
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
400
400
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # byte-wise table lookup; the paranoid table only covers ascii,
        # so any byte >= 0x80 raises IndexError and we fall through
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
456
456
# total byte length of a utf-8 sequence, keyed by the high nibble of its
# first byte; 0 means a single-byte ascii character. Nibbles 0x8-0xb are
# continuation bytes: length 1 makes the validating decode below fail.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos]) >> 4]
    if not l: # ascii
        return s[pos]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
475
475
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: a string without \xed cannot already contain escaped
    # surrogates, so a clean UTF-8 string may pass through untouched
    if "\xed" not in s:
        if isinstance(s, localstr):
            # localstr caches the original UTF-8 it was decoded from
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # Accumulate chunks in a list and join once at the end; the previous
    # r += c form rebuilt the string on every iteration, which is
    # quadratic on long non-UTF-8 inputs.
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: escape it into the U+DC00-U+DCFF range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        chunks.append(c)
    return "".join(chunks)
532
532
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # Accumulate chunks in a list and join once at the end; the previous
    # r += c form rebuilt the string on every iteration, which is
    # quadratic on long escaped inputs.
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original raw byte
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        chunks.append(c)
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now