##// END OF EJS Templates
encoding: handle empty string in toutf8
Matt Mackall -
r22425:6fd944c2 default
parent child Browse files
Show More
@@ -1,380 +1,380 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
def _getpreferredencoding():
    '''
    Return the codeset of the user's default locale.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the one actually
    expected.

    LC_CTYPE is switched to the user's default locale only while the
    codeset is queried; the previous setting is restored afterwards,
    even if the query raises.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process with a modified LC_CTYPE
        locale.setlocale(locale.LC_CTYPE, oldloc)
34
34
# Maps an encoding name reported by the platform to a callable that
# returns the encoding name to use in its place.
_encodingfixers = {
    # mac-roman is suspicious: ask the locale machinery again
    'mac-roman': _getpreferredencoding,
    # the following two names are treated as plain ASCII
    'ANSI_X3.4-1968': lambda: 'ascii',
    '646': lambda: 'ascii',
}
40
40
# encoding tried by tolocal() when a stored string is not valid UTF-8
fallbackencoding = 'ISO-8859-1'
# error handling mode used when decoding local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")

# The local encoding: HGENCODING wins; otherwise ask the locale and run the
# answer through the fixer table. Any locale failure means plain ASCII.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
50
50
class localstr(str):
    '''A local-encoding string that remembers its UTF-8 form.

    The string value itself is the local representation 'l'; the UTF-8
    bytes 'u' are kept in the _utf8 attribute so that an unmodified
    string can be round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        self = str.__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
60
60
61 def tolocal(s):
61 def tolocal(s):
62 """
62 """
63 Convert a string from internal UTF-8 to local encoding
63 Convert a string from internal UTF-8 to local encoding
64
64
65 All internal strings should be UTF-8 but some repos before the
65 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
66 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
67 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
69 replace unknown characters.
70
70
71 The localstr class is used to cache the known UTF-8 encoding of
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
73 round-trip conversion back to UTF-8.
74
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
76 >>> l = tolocal(u)
77 >>> l
77 >>> l
78 'foo: ?'
78 'foo: ?'
79 >>> fromlocal(l)
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> len(d) # no collision
83 >>> len(d) # no collision
84 2
84 2
85 >>> 'foo: ?' in d
85 >>> 'foo: ?' in d
86 False
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
88 >>> l = tolocal(l1)
89 >>> l
89 >>> l
90 'foo: ?'
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
92 'foo: \\xc3\\xa4'
93 """
93 """
94
94
95 try:
95 try:
96 try:
96 try:
97 # make sure string is actually stored in UTF-8
97 # make sure string is actually stored in UTF-8
98 u = s.decode('UTF-8')
98 u = s.decode('UTF-8')
99 if encoding == 'UTF-8':
99 if encoding == 'UTF-8':
100 # fast path
100 # fast path
101 return s
101 return s
102 r = u.encode(encoding, "replace")
102 r = u.encode(encoding, "replace")
103 if u == r.decode(encoding):
103 if u == r.decode(encoding):
104 # r is a safe, non-lossy encoding of s
104 # r is a safe, non-lossy encoding of s
105 return r
105 return r
106 return localstr(s, r)
106 return localstr(s, r)
107 except UnicodeDecodeError:
107 except UnicodeDecodeError:
108 # we should only get here if we're looking at an ancient changeset
108 # we should only get here if we're looking at an ancient changeset
109 try:
109 try:
110 u = s.decode(fallbackencoding)
110 u = s.decode(fallbackencoding)
111 r = u.encode(encoding, "replace")
111 r = u.encode(encoding, "replace")
112 if u == r.decode(encoding):
112 if u == r.decode(encoding):
113 # r is a safe, non-lossy encoding of s
113 # r is a safe, non-lossy encoding of s
114 return r
114 return r
115 return localstr(u.encode('UTF-8'), r)
115 return localstr(u.encode('UTF-8'), r)
116 except UnicodeDecodeError:
116 except UnicodeDecodeError:
117 u = s.decode("utf-8", "replace") # last ditch
117 u = s.decode("utf-8", "replace") # last ditch
118 return u.encode(encoding, "replace") # can't round-trip
118 return u.encode(encoding, "replace") # can't round-trip
119 except LookupError, k:
119 except LookupError, k:
120 raise error.Abort(k, hint="please check your locale settings")
120 raise error.Abort(k, hint="please check your locale settings")
121
121
122 def fromlocal(s):
122 def fromlocal(s):
123 """
123 """
124 Convert a string from the local character encoding to UTF-8
124 Convert a string from the local character encoding to UTF-8
125
125
126 We attempt to decode strings using the encoding mode set by
126 We attempt to decode strings using the encoding mode set by
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 characters will cause an error message. Other modes include
128 characters will cause an error message. Other modes include
129 'replace', which replaces unknown characters with a special
129 'replace', which replaces unknown characters with a special
130 Unicode character, and 'ignore', which drops the character.
130 Unicode character, and 'ignore', which drops the character.
131 """
131 """
132
132
133 # can we do a lossless round-trip?
133 # can we do a lossless round-trip?
134 if isinstance(s, localstr):
134 if isinstance(s, localstr):
135 return s._utf8
135 return s._utf8
136
136
137 try:
137 try:
138 return s.decode(encoding, encodingmode).encode("utf-8")
138 return s.decode(encoding, encodingmode).encode("utf-8")
139 except UnicodeDecodeError, inst:
139 except UnicodeDecodeError, inst:
140 sub = s[max(0, inst.start - 10):inst.start + 10]
140 sub = s[max(0, inst.start - 10):inst.start + 10]
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 except LookupError, k:
142 except LookupError, k:
143 raise error.Abort(k, hint="please check your locale settings")
143 raise error.Abort(k, hint="please check your locale settings")
144
144
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
# 'wide' to treat them as wide. 'wide' holds the east asian width classes
# that ucolwidth() counts as two columns.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148
148
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
152
152
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    columns = 0
    for c in d:
        # characters whose width class is listed in 'wide' take two columns
        if eaw(c) in wide:
            columns += 2
        else:
            columns += 1
    return columns
159
159
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice s[start:x], trying x = start + c and up,
    whose display width is exactly c columns, or None if there is no
    such slice.'''
    # the upper bound is len(s) + 1 so that the slice running to the very
    # end of s is tried as well
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None
167
167
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Strings that cannot be decoded in the local encoding are trimmed
    byte by byte instead of column by column.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for the text itself once the ellipsis is accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable: treat every byte as one column
        if len(s) <= width: # trimming is not needed
            return s
        if room <= 0: # not enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    if room <= 0: # not enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character per iteration from the trimmed side until
    # what is left fits into the available columns
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            text = kept.encode(encoding)
            if leftside:
                return ellipsis + text
            return text + ellipsis
    return ellipsis # not enough room for multi-column characters
260
260
261 def lower(s):
261 def lower(s):
262 "best-effort encoding-aware case-folding of local string s"
262 "best-effort encoding-aware case-folding of local string s"
263 try:
263 try:
264 s.decode('ascii') # throw exception for non-ASCII character
264 s.decode('ascii') # throw exception for non-ASCII character
265 return s.lower()
265 return s.lower()
266 except UnicodeDecodeError:
266 except UnicodeDecodeError:
267 pass
267 pass
268 try:
268 try:
269 if isinstance(s, localstr):
269 if isinstance(s, localstr):
270 u = s._utf8.decode("utf-8")
270 u = s._utf8.decode("utf-8")
271 else:
271 else:
272 u = s.decode(encoding, encodingmode)
272 u = s.decode(encoding, encodingmode)
273
273
274 lu = u.lower()
274 lu = u.lower()
275 if u == lu:
275 if u == lu:
276 return s # preserve localstring
276 return s # preserve localstring
277 return lu.encode(encoding)
277 return lu.encode(encoding)
278 except UnicodeError:
278 except UnicodeError:
279 return s.lower() # we don't know how to fold this except in ASCII
279 return s.lower() # we don't know how to fold this except in ASCII
280 except LookupError, k:
280 except LookupError, k:
281 raise error.Abort(k, hint="please check your locale settings")
281 raise error.Abort(k, hint="please check your locale settings")
282
282
283 def upper(s):
283 def upper(s):
284 "best-effort encoding-aware case-folding of local string s"
284 "best-effort encoding-aware case-folding of local string s"
285 try:
285 try:
286 s.decode('ascii') # throw exception for non-ASCII character
286 s.decode('ascii') # throw exception for non-ASCII character
287 return s.upper()
287 return s.upper()
288 except UnicodeDecodeError:
288 except UnicodeDecodeError:
289 pass
289 pass
290 try:
290 try:
291 if isinstance(s, localstr):
291 if isinstance(s, localstr):
292 u = s._utf8.decode("utf-8")
292 u = s._utf8.decode("utf-8")
293 else:
293 else:
294 u = s.decode(encoding, encodingmode)
294 u = s.decode(encoding, encodingmode)
295
295
296 uu = u.upper()
296 uu = u.upper()
297 if u == uu:
297 if u == uu:
298 return s # preserve localstring
298 return s # preserve localstring
299 return uu.encode(encoding)
299 return uu.encode(encoding)
300 except UnicodeError:
300 except UnicodeError:
301 return s.upper() # we don't know how to fold this except in ASCII
301 return s.upper() # we don't know how to fold this except in ASCII
302 except LookupError, k:
302 except LookupError, k:
303 raise error.Abort(k, hint="please check your locale settings")
303 raise error.Abort(k, hint="please check your locale settings")
304
304
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified;
    this includes the empty string.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        # only the success of the decode matters, not its (possibly
        # empty, hence false) result
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Slow path: walk s one UTF-8 sequence at a time and surrogate-encode
    # every byte that does not start a decodable sequence. Deciding at each
    # position, rather than comparing s against a copy decoded with
    # 'ignore', keeps an invalid byte from being mistaken for an equal
    # byte of a following valid sequence.
    parts = []
    pos = 0
    end = len(s)
    while pos < end:
        for size in (1, 2, 3, 4):
            seq = s[pos:pos + size]
            try:
                seq.decode('utf-8')
            except UnicodeDecodeError:
                continue
            # shortest decodable sequence at pos: exactly one character
            parts.append(seq)
            pos += len(seq)
            break
        else:
            # no sequence starts here: escape this single byte
            parts.append(unichr(0xdc00 + ord(s[pos])).encode('utf-8'))
            pos += 1
    return "".join(parts)
353
353
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Bytes that toutf8b escaped as U+DC00-U+DCFF are turned back into
    the original raw bytes; everything else is passed through
    unchanged. This is a round-trip process for strings like
    filenames, but metadata that was passed through tolocal will
    remain in UTF-8.

    Raises UnicodeDecodeError if s contains the byte 0xED but cannot
    be decoded as UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # validation only: malformed input keeps raising UnicodeDecodeError
    s.decode("utf-8")

    # Scan the bytes rather than the decoded characters: on builds that
    # store unicode as UTF-16, iterating the decoded string splits non-BMP
    # characters into surrogate pairs whose low half can fall into
    # U+DC00-U+DCFF and would be mistaken for an escaped byte.
    # In decodable UTF-8, 0xED is always the lead byte of a three-byte
    # sequence, so the two bytes after it are guaranteed to exist.
    parts = []
    pos = 0
    while True:
        idx = s.find("\xed", pos)
        if idx < 0:
            parts.append(s[pos:])
            break
        second = s[idx + 1]
        if "\xb0" <= second <= "\xb3":
            # ED B0..B3 xx encodes U+DC00-U+DCFF: recover the raw byte
            parts.append(s[pos:idx])
            low = ((ord(second) & 0x03) << 6) | (ord(s[idx + 2]) & 0x3f)
            parts.append(chr(low))
        else:
            # some other character in U+D000-U+DFFF: keep it as is
            parts.append(s[pos:idx + 3])
        pos = idx + 3
    return "".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now