encoding: remove workaround for locale.getpreferredencoding()...
Gregory Szorc
r32276:1a3a08b5 default
@@ -1,620 +1,595 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 pycompat,
18 18 )
19 19
20 20 _sysstr = pycompat.sysstr
21 21
22 22 if pycompat.ispy3:
23 23 unichr = chr
24 24
25 25 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
26 26 # "Unicode Subtleties"), so we need to ignore them in some places for
27 27 # sanity.
28 28 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
29 29 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
30 30 "206a 206b 206c 206d 206e 206f feff".split()]
31 31 # verify the next function will work
32 32 if pycompat.ispy3:
33 33 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
34 34 else:
35 35 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
36 36
37 37 def hfsignoreclean(s):
38 38 """Remove codepoints ignored by HFS+ from s.
39 39
40 40 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
41 41 '.hg'
42 42 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
43 43 '.hg'
44 44 """
45 45 if "\xe2" in s or "\xef" in s:
46 46 for c in _ignore:
47 47 s = s.replace(c, '')
48 48 return s
49 49
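# --- Editor's sketch (not part of the changeset): why hfsignoreclean
# matters. Two byte strings that differ only by an HFS+-ignorable codepoint
# collide once cleaned; _example_hfs is a hypothetical name. Assumes
# Python 2 semantics, where str is a byte string.
def _example_hfs():
    plain = '.hg'
    sneaky = u'.h\u200cg'.encode('utf-8')   # zero-width non-joiner inside
    assert sneaky != plain
    assert hfsignoreclean(sneaky) == plain  # the ZWNJ is stripped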
50 50 # encoding.environ is provided read-only, and may not be used to modify
51 51 # the process environment
52 52 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
53 53 if not pycompat.ispy3:
54 54 environ = os.environ # re-exports
55 55 elif _nativeenviron:
56 56 environ = os.environb # re-exports
57 57 else:
58 58 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
59 59 # and recreate it once encoding is settled
60 60 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
61 61 for k, v in os.environ.items()) # re-exports
62 62
63 def _getpreferredencoding():
64 '''
65 On darwin, getpreferredencoding ignores the locale environment and
66 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
67 for Python 2.7 and up. This is the same corrected code for earlier
68 Python versions.
69
70 However, we can't use a version check for this method, as some distributions
71 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
72 encoding, as it is unlikely that this encoding is the one actually expected.
73 '''
74 try:
75 locale.CODESET
76 except AttributeError:
77 # Fall back to parsing environment variables :-(
78 return locale.getdefaultlocale()[1]
79
80 oldloc = locale.setlocale(locale.LC_CTYPE)
81 locale.setlocale(locale.LC_CTYPE, "")
82 result = locale.nl_langinfo(locale.CODESET)
83 locale.setlocale(locale.LC_CTYPE, oldloc)
84
85 return result
86
87 63 _encodingfixers = {
88 64 '646': lambda: 'ascii',
89 65 'ANSI_X3.4-1968': lambda: 'ascii',
90 'mac-roman': _getpreferredencoding
91 66 }
92 67
93 68 try:
94 69 encoding = environ.get("HGENCODING")
95 70 if not encoding:
96 71 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
97 72 encoding = _encodingfixers.get(encoding, lambda: encoding)()
98 73 except locale.Error:
99 74 encoding = 'ascii'
100 75 encodingmode = environ.get("HGENCODINGMODE", "strict")
101 76 fallbackencoding = 'ISO-8859-1'
102 77
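# --- Editor's sketch (hypothetical helper, not part of the changeset): the
# detection above, restated as a pure function so the fixer table's effect
# is visible without touching module state.
def _example_detect(env):
    enc = env.get("HGENCODING")
    if not enc:
        enc = locale.getpreferredencoding().encode('ascii') or 'ascii'
    return _encodingfixers.get(enc, lambda: enc)()

# _example_detect({"HGENCODING": "646"}) == 'ascii'    # alias normalized
# _example_detect({"HGENCODING": "UTF-8"}) == 'UTF-8'  # passed through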
103 78 class localstr(str):
104 79 '''This class allows strings that are unmodified to be
105 80 round-tripped to the local encoding and back'''
106 81 def __new__(cls, u, l):
107 82 s = str.__new__(cls, l)
108 83 s._utf8 = u
109 84 return s
110 85 def __hash__(self):
111 86 return hash(self._utf8) # avoid collisions in local string space
112 87
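# --- Editor's sketch (hypothetical name, not part of the changeset):
# localstr pairs lossy local-encoding bytes with the original UTF-8 bytes,
# and hashes by the UTF-8 form so distinct UTF-8 inputs with the same lossy
# local form don't collide in dicts.
def _example_localstr():
    u = 'foo: \xc3\xa4'          # UTF-8 bytes
    l = localstr(u, 'foo: ?')    # pretend 'foo: ?' is the lossy local form
    assert l == 'foo: ?' and l._utf8 == u
    assert hash(l) == hash(u)    # hashed by the UTF-8 form, not the local one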
113 88 def tolocal(s):
114 89 """
115 90 Convert a string from internal UTF-8 to local encoding
116 91
117 92 All internal strings should be UTF-8 but some repos before the
118 93 implementation of locale support may contain latin1 or possibly
119 94 other character sets. We attempt to decode everything strictly
120 95 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
121 96 replace unknown characters.
122 97
123 98 The localstr class is used to cache the known UTF-8 encoding of
124 99 strings next to their local representation to allow lossless
125 100 round-trip conversion back to UTF-8.
126 101
127 102 >>> u = 'foo: \\xc3\\xa4' # utf-8
128 103 >>> l = tolocal(u)
129 104 >>> l
130 105 'foo: ?'
131 106 >>> fromlocal(l)
132 107 'foo: \\xc3\\xa4'
133 108 >>> u2 = 'foo: \\xc3\\xa1'
134 109 >>> d = { l: 1, tolocal(u2): 2 }
135 110 >>> len(d) # no collision
136 111 2
137 112 >>> 'foo: ?' in d
138 113 False
139 114 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
140 115 >>> l = tolocal(l1)
141 116 >>> l
142 117 'foo: ?'
143 118 >>> fromlocal(l) # magically in utf-8
144 119 'foo: \\xc3\\xa4'
145 120 """
146 121
147 122 try:
148 123 try:
149 124 # make sure string is actually stored in UTF-8
150 125 u = s.decode('UTF-8')
151 126 if encoding == 'UTF-8':
152 127 # fast path
153 128 return s
154 129 r = u.encode(_sysstr(encoding), u"replace")
155 130 if u == r.decode(_sysstr(encoding)):
156 131 # r is a safe, non-lossy encoding of s
157 132 return r
158 133 return localstr(s, r)
159 134 except UnicodeDecodeError:
160 135 # we should only get here if we're looking at an ancient changeset
161 136 try:
162 137 u = s.decode(_sysstr(fallbackencoding))
163 138 r = u.encode(_sysstr(encoding), u"replace")
164 139 if u == r.decode(_sysstr(encoding)):
165 140 # r is a safe, non-lossy encoding of s
166 141 return r
167 142 return localstr(u.encode('UTF-8'), r)
168 143 except UnicodeDecodeError:
169 144 u = s.decode("utf-8", "replace") # last ditch
170 145 # can't round-trip
171 146 return u.encode(_sysstr(encoding), u"replace")
172 147 except LookupError as k:
173 148 raise error.Abort(k, hint="please check your locale settings")
174 149
175 150 def fromlocal(s):
176 151 """
177 152 Convert a string from the local character encoding to UTF-8
178 153
179 154 We attempt to decode strings using the encoding mode set by
180 155 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
181 156 characters will cause an error message. Other modes include
182 157 'replace', which replaces unknown characters with a special
183 158 Unicode character, and 'ignore', which drops the character.
184 159 """
185 160
186 161 # can we do a lossless round-trip?
187 162 if isinstance(s, localstr):
188 163 return s._utf8
189 164
190 165 try:
191 166 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
192 167 return u.encode("utf-8")
193 168 except UnicodeDecodeError as inst:
194 169 sub = s[max(0, inst.start - 10):inst.start + 10]
195 170 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
196 171 except LookupError as k:
197 172 raise error.Abort(k, hint="please check your locale settings")
198 173
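# --- Editor's sketch (hypothetical name, not part of the changeset):
# tolocal/fromlocal form a lossless round trip even when the local encoding
# cannot represent the input, because tolocal returns a localstr carrying
# the original UTF-8 bytes.
def _example_roundtrip():
    u = 'caf\xc3\xa9'                  # UTF-8 input
    assert fromlocal(tolocal(u)) == u  # lossless whatever the local encoding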
199 174 def unitolocal(u):
200 175 """Convert a unicode string to a byte string of local encoding"""
201 176 return tolocal(u.encode('utf-8'))
202 177
203 178 def unifromlocal(s):
204 179 """Convert a byte string of local encoding to a unicode string"""
205 180 return fromlocal(s).decode('utf-8')
206 181
207 182 # converter functions between native str and byte string. use these if the
208 183 # string is not encoding-aware (e.g. exception messages) or is known to
209 184 # be locale dependent (e.g. date formatting)
210 185 if pycompat.ispy3:
211 186 strtolocal = unitolocal
212 187 strfromlocal = unifromlocal
213 188 else:
214 189 strtolocal = pycompat.identity
215 190 strfromlocal = pycompat.identity
216 191
217 192 if not _nativeenviron:
218 193 # now encoding and helper functions are available, recreate the environ
219 194 # dict to be exported to other modules
220 195 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
221 196 for k, v in os.environ.items()) # re-exports
222 197
223 198 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
224 199 wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
225 200 and "WFA" or "WF")
226 201
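# --- Editor's sketch (not part of the changeset): East-Asian-width category
# 'A' (ambiguous) characters count as two columns only when
# HGENCODINGAMBIGUOUS=wide selected "WFA" above; 'W' and 'F' always count
# as two.
def _example_ambiguous():
    assert unicodedata.east_asian_width(u'\xb1') == 'A'  # PLUS-MINUS SIGN
    # ucolwidth(u'\xb1') is 2 under "WFA", 1 under "WF"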
227 202 def colwidth(s):
228 203 "Find the column width of a string for display in the local encoding"
229 204 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
230 205
231 206 def ucolwidth(d):
232 207 "Find the column width of a Unicode string for display"
233 208 eaw = getattr(unicodedata, 'east_asian_width', None)
234 209 if eaw is not None:
235 210 return sum([eaw(c) in wide and 2 or 1 for c in d])
236 211 return len(d)
237 212
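# --- Editor's sketch (hypothetical name, not part of the changeset): wide
# CJK characters occupy two display columns, so column width and character
# count diverge.
def _example_colwidth():
    assert ucolwidth(u'abc') == 3
    assert ucolwidth(u'\u3042\u3044') == 4  # two hiragana, two columns each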
238 213 def getcols(s, start, c):
239 214 '''Use colwidth to find a c-column substring of s starting at byte
240 215 index start'''
241 216 for x in xrange(start + c, len(s)):
242 217 t = s[start:x]
243 218 if colwidth(t) == c:
244 219 return t
245 220
246 221 def trim(s, width, ellipsis='', leftside=False):
247 222 """Trim string 's' to at most 'width' columns (including 'ellipsis').
248 223
249 224 If 'leftside' is True, left side of string 's' is trimmed.
250 225 'ellipsis' is always placed at trimmed side.
251 226
252 227 >>> ellipsis = '+++'
253 228 >>> from . import encoding
254 229 >>> encoding.encoding = 'utf-8'
255 230 >>> t= '1234567890'
256 231 >>> print trim(t, 12, ellipsis=ellipsis)
257 232 1234567890
258 233 >>> print trim(t, 10, ellipsis=ellipsis)
259 234 1234567890
260 235 >>> print trim(t, 8, ellipsis=ellipsis)
261 236 12345+++
262 237 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
263 238 +++67890
264 239 >>> print trim(t, 8)
265 240 12345678
266 241 >>> print trim(t, 8, leftside=True)
267 242 34567890
268 243 >>> print trim(t, 3, ellipsis=ellipsis)
269 244 +++
270 245 >>> print trim(t, 1, ellipsis=ellipsis)
271 246 +
272 247 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
273 248 >>> t = u.encode(encoding.encoding)
274 249 >>> print trim(t, 12, ellipsis=ellipsis)
275 250 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
276 251 >>> print trim(t, 10, ellipsis=ellipsis)
277 252 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
278 253 >>> print trim(t, 8, ellipsis=ellipsis)
279 254 \xe3\x81\x82\xe3\x81\x84+++
280 255 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
281 256 +++\xe3\x81\x88\xe3\x81\x8a
282 257 >>> print trim(t, 5)
283 258 \xe3\x81\x82\xe3\x81\x84
284 259 >>> print trim(t, 5, leftside=True)
285 260 \xe3\x81\x88\xe3\x81\x8a
286 261 >>> print trim(t, 4, ellipsis=ellipsis)
287 262 +++
288 263 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
289 264 +++
290 265 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
291 266 >>> print trim(t, 12, ellipsis=ellipsis)
292 267 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
293 268 >>> print trim(t, 10, ellipsis=ellipsis)
294 269 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
295 270 >>> print trim(t, 8, ellipsis=ellipsis)
296 271 \x11\x22\x33\x44\x55+++
297 272 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
298 273 +++\x66\x77\x88\x99\xaa
299 274 >>> print trim(t, 8)
300 275 \x11\x22\x33\x44\x55\x66\x77\x88
301 276 >>> print trim(t, 8, leftside=True)
302 277 \x33\x44\x55\x66\x77\x88\x99\xaa
303 278 >>> print trim(t, 3, ellipsis=ellipsis)
304 279 +++
305 280 >>> print trim(t, 1, ellipsis=ellipsis)
306 281 +
307 282 """
308 283 try:
309 284 u = s.decode(_sysstr(encoding))
310 285 except UnicodeDecodeError:
311 286 if len(s) <= width: # trimming is not needed
312 287 return s
313 288 width -= len(ellipsis)
314 289 if width <= 0: # not enough room even for ellipsis
315 290 return ellipsis[:width + len(ellipsis)]
316 291 if leftside:
317 292 return ellipsis + s[-width:]
318 293 return s[:width] + ellipsis
319 294
320 295 if ucolwidth(u) <= width: # trimming is not needed
321 296 return s
322 297
323 298 width -= len(ellipsis)
324 299 if width <= 0: # not enough room even for ellipsis
325 300 return ellipsis[:width + len(ellipsis)]
326 301
327 302 if leftside:
328 303 uslice = lambda i: u[i:]
329 304 concat = lambda s: ellipsis + s
330 305 else:
331 306 uslice = lambda i: u[:-i]
332 307 concat = lambda s: s + ellipsis
333 308 for i in xrange(1, len(u)):
334 309 usub = uslice(i)
335 310 if ucolwidth(usub) <= width:
336 311 return concat(usub.encode(_sysstr(encoding)))
337 312 return ellipsis # not enough room for multi-column characters
338 313
339 314 def _asciilower(s):
340 315 '''convert a string to lowercase if ASCII
341 316
342 317 Raises UnicodeDecodeError if non-ASCII characters are found.'''
343 318 s.decode('ascii')
344 319 return s.lower()
345 320
346 321 def asciilower(s):
347 322 # delay the import to avoid a cyclic dependency around "parsers" in
348 323 # the pure Python build (util => i18n => encoding => parsers => util)
349 324 from . import parsers
350 325 impl = getattr(parsers, 'asciilower', _asciilower)
351 326 global asciilower
352 327 asciilower = impl
353 328 return impl(s)
354 329
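# --- Editor's note (not part of the changeset): asciilower rebinds the
# module-level name to the chosen implementation on first use, so the
# parsers import and getattr fallback run exactly once. A minimal sketch of
# calling it, assuming Python 2 byte strings:
def _example_asciilower():
    assert asciilower('MiXeD-128') == 'mixed-128'
    try:
        asciilower('caf\xc3\xa9')   # non-ASCII bytes are rejected
    except UnicodeDecodeError:
        pass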
355 330 def _asciiupper(s):
356 331 '''convert a string to uppercase if ASCII
357 332
358 333 Raises UnicodeDecodeError if non-ASCII characters are found.'''
359 334 s.decode('ascii')
360 335 return s.upper()
361 336
362 337 def asciiupper(s):
363 338 # delay the import to avoid a cyclic dependency around "parsers" in
364 339 # the pure Python build (util => i18n => encoding => parsers => util)
365 340 from . import parsers
366 341 impl = getattr(parsers, 'asciiupper', _asciiupper)
367 342 global asciiupper
368 343 asciiupper = impl
369 344 return impl(s)
370 345
371 346 def lower(s):
372 347 "best-effort encoding-aware case-folding of local string s"
373 348 try:
374 349 return asciilower(s)
375 350 except UnicodeDecodeError:
376 351 pass
377 352 try:
378 353 if isinstance(s, localstr):
379 354 u = s._utf8.decode("utf-8")
380 355 else:
381 356 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
382 357
383 358 lu = u.lower()
384 359 if u == lu:
385 360 return s # preserve localstring
386 361 return lu.encode(_sysstr(encoding))
387 362 except UnicodeError:
388 363 return s.lower() # we don't know how to fold this except in ASCII
389 364 except LookupError as k:
390 365 raise error.Abort(k, hint="please check your locale settings")
391 366
392 367 def upper(s):
393 368 "best-effort encoding-aware case-folding of local string s"
394 369 try:
395 370 return asciiupper(s)
396 371 except UnicodeDecodeError:
397 372 return upperfallback(s)
398 373
399 374 def upperfallback(s):
400 375 try:
401 376 if isinstance(s, localstr):
402 377 u = s._utf8.decode("utf-8")
403 378 else:
404 379 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
405 380
406 381 uu = u.upper()
407 382 if u == uu:
408 383 return s # preserve localstring
409 384 return uu.encode(_sysstr(encoding))
410 385 except UnicodeError:
411 386 return s.upper() # we don't know how to fold this except in ASCII
412 387 except LookupError as k:
413 388 raise error.Abort(k, hint="please check your locale settings")
414 389
415 390 class normcasespecs(object):
416 391 '''what a platform's normcase does to ASCII strings
417 392
418 393 This is specified per platform, and should be consistent with what normcase
419 394 on that platform actually does.
420 395
421 396 lower: normcase lowercases ASCII strings
422 397 upper: normcase uppercases ASCII strings
423 398 other: the fallback function should always be called
424 399
425 400 This should be kept in sync with normcase_spec in util.h.'''
426 401 lower = -1
427 402 upper = 1
428 403 other = 0
429 404
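# --- Editor's sketch (hypothetical helper, not part of the changeset): how
# a caller might dispatch on these constants, in the spirit of the
# platform case-folding code.
def _example_foldname(path, spec, fallback):
    if spec == normcasespecs.lower:
        return path.lower()
    elif spec == normcasespecs.upper:
        return path.upper()
    return fallback(path)            # normcasespecs.other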
430 405 _jsonmap = []
431 406 _jsonmap.extend("\\u%04x" % x for x in range(32))
432 407 _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
433 408 _jsonmap.append('\\u007f')
434 409 _jsonmap[0x09] = '\\t'
435 410 _jsonmap[0x0a] = '\\n'
436 411 _jsonmap[0x22] = '\\"'
437 412 _jsonmap[0x5c] = '\\\\'
438 413 _jsonmap[0x08] = '\\b'
439 414 _jsonmap[0x0c] = '\\f'
440 415 _jsonmap[0x0d] = '\\r'
441 416 _paranoidjsonmap = _jsonmap[:]
442 417 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
443 418 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
444 419 _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
445 420
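# --- Editor's note (not part of the changeset): _jsonmap is a 256-entry
# table indexed by byte value, while _paranoidjsonmap deliberately keeps
# only the first 128 entries, so non-ASCII bytes raise IndexError in
# jsonescape below and fall through to the explicit \uXXXX escaping path.
def _example_jsonmap():
    assert len(_jsonmap) == 256 and len(_paranoidjsonmap) == 128
    assert _jsonmap[0x0a] == '\\n'   # control byte -> JSON escape
    assert _jsonmap[0x41] == 'A'     # printable byte -> itself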
446 421 def jsonescape(s, paranoid=False):
447 422 '''returns a string suitable for JSON
448 423
449 424 JSON is problematic for us because it doesn't support non-Unicode
450 425 bytes. To deal with this, we take the following approach:
451 426
452 427 - localstr objects are converted back to UTF-8
453 428 - valid UTF-8/ASCII strings are passed as-is
454 429 - other strings are converted to UTF-8b surrogate encoding
455 430 - apply JSON-specified string escaping
456 431
457 432 (escapes are doubled in these tests)
458 433
459 434 >>> jsonescape('this is a test')
460 435 'this is a test'
461 436 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
462 437 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
463 438 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
464 439 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
465 440 >>> jsonescape('a weird byte: \\xdd')
466 441 'a weird byte: \\xed\\xb3\\x9d'
467 442 >>> jsonescape('utf-8: caf\\xc3\\xa9')
468 443 'utf-8: caf\\xc3\\xa9'
469 444 >>> jsonescape('')
470 445 ''
471 446
472 447 If paranoid, non-ascii and common troublesome characters are also escaped.
473 448 This is suitable for web output.
474 449
475 450 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
476 451 'escape boundary: ~ \\\\u007f \\\\u0080'
477 452 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
478 453 'a weird byte: \\\\udcdd'
479 454 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
480 455 'utf-8: caf\\\\u00e9'
481 456 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
482 457 'non-BMP: \\\\ud834\\\\udd1e'
483 458 >>> jsonescape('<foo@example.org>', paranoid=True)
484 459 '\\\\u003cfoo@example.org\\\\u003e'
485 460 '''
486 461
487 462 if paranoid:
488 463 jm = _paranoidjsonmap
489 464 else:
490 465 jm = _jsonmap
491 466
492 467 u8chars = toutf8b(s)
493 468 try:
494 469 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
495 470 except IndexError:
496 471 pass
497 472 # non-BMP char is represented as UTF-16 surrogate pair
498 473 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
499 474 u16codes.pop(0) # drop BOM
500 475 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
501 476
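# --- Editor's sketch (hypothetical name, not part of the changeset): the
# escaped output can be dropped straight into a JSON string literal and
# parsed back by a standard JSON decoder.
def _example_jsonescape():
    import json
    s = jsonescape('tab:\there')
    assert s == 'tab:\\there'                    # literal backslash-t
    assert json.loads('"%s"' % s) == u'tab:\there'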
502 477 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
503 478
504 479 def getutf8char(s, pos):
505 480 '''get the next full utf-8 character in the given string, starting at pos
506 481
507 482 Raises a UnicodeError if the given location does not start a valid
508 483 utf-8 character.
509 484 '''
510 485
511 486 # find how many bytes to attempt decoding from first nibble
512 487 l = _utf8len[ord(s[pos]) >> 4]
513 488 if not l: # ascii
514 489 return s[pos]
515 490
516 491 c = s[pos:pos + l]
517 492 # validate with attempted decode
518 493 c.decode("utf-8")
519 494 return c
520 495
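# --- Editor's sketch (hypothetical name, not part of the changeset):
# walking a byte string one UTF-8 character at a time, the same loop shape
# that toutf8b and fromutf8b use below.
def _example_utf8walk(s):
    pos, chars = 0, []
    while pos < len(s):
        c = getutf8char(s, pos)
        chars.append(c)
        pos += len(c)
    return chars

# _example_utf8walk('caf\xc3\xa9') == ['c', 'a', 'f', '\xc3\xa9']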
521 496 def toutf8b(s):
522 497 '''convert a local, possibly-binary string into UTF-8b
523 498
524 499 This is intended as a generic method to preserve data when working
525 500 with schemes like JSON and XML that have no provision for
526 501 arbitrary byte strings. As Mercurial often doesn't know
527 502 what encoding data is in, we use so-called UTF-8b.
528 503
529 504 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
530 505 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
531 506 uDC00-uDCFF.
532 507
533 508 Principles of operation:
534 509
535 510 - ASCII and UTF-8 data successfully round-trips and is understood
536 511 by Unicode-oriented clients
537 512 - filenames and file contents in arbitrary other encodings can be
538 513 round-tripped or recovered by clueful clients
539 514 - local strings that have a cached known UTF-8 encoding (aka
540 515 localstr) get sent as UTF-8 so Unicode-oriented clients get the
541 516 Unicode data they want
542 517 - because we must preserve UTF-8 bytestrings in places such as
543 518 filenames, metadata can't be round-tripped without help
544 519
545 520 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
546 521 arbitrary bytes into an internal Unicode format that can be
547 522 re-encoded back into the original. Here we are exposing the
548 523 internal surrogate encoding as a UTF-8 string.)
549 524 '''
550 525
551 526 if "\xed" not in s:
552 527 if isinstance(s, localstr):
553 528 return s._utf8
554 529 try:
555 530 s.decode('utf-8')
556 531 return s
557 532 except UnicodeDecodeError:
558 533 pass
559 534
560 535 r = ""
561 536 pos = 0
562 537 l = len(s)
563 538 while pos < l:
564 539 try:
565 540 c = getutf8char(s, pos)
566 541 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
567 542 # have to re-escape existing U+DCxx characters
568 543 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
569 544 pos += 1
570 545 else:
571 546 pos += len(c)
572 547 except UnicodeDecodeError:
573 548 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
574 549 pos += 1
575 550 r += c
576 551 return r
577 552
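# --- Editor's sketch (hypothetical name, not part of the changeset): bytes
# that are not valid UTF-8 are smuggled into the U+DCxx surrogate range and
# recovered exactly by fromutf8b below. Assumes Python 2, where surrogates
# can be encoded to UTF-8 directly.
def _example_utf8b():
    raw = 'caf\xe9'                          # latin-1 bytes, invalid UTF-8
    assert toutf8b(raw) == 'caf\xed\xb3\xa9' # \xe9 -> U+DCE9 as UTF-8
    assert fromutf8b(toutf8b(raw)) == raw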
578 553 def fromutf8b(s):
579 554 '''Given a UTF-8b string, return a local, possibly-binary string.
580 555
581 556 This is a round-trip process for strings like filenames, but
582 557 metadata that was passed through tolocal will remain in
583 558 UTF-8.
584 559
585 560 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
586 561 >>> m = "\\xc3\\xa9\\x99abcd"
587 562 >>> toutf8b(m)
588 563 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
589 564 >>> roundtrip(m)
590 565 True
591 566 >>> roundtrip("\\xc2\\xc2\\x80")
592 567 True
593 568 >>> roundtrip("\\xef\\xbf\\xbd")
594 569 True
595 570 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
596 571 True
597 572 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
598 573 True
599 574 '''
600 575
601 576 # fast path - look for uDxxx prefixes in s
602 577 if "\xed" not in s:
603 578 return s
604 579
605 580 # We could do this with the unicode type but some Python builds
606 581 # use UTF-16 internally (issue5031) which causes non-BMP code
607 582 # points to be escaped. Instead, we use our handy getutf8char
608 583 # helper again to walk the string without "decoding" it.
609 584
610 585 r = ""
611 586 pos = 0
612 587 l = len(s)
613 588 while pos < l:
614 589 c = getutf8char(s, pos)
615 590 pos += len(c)
616 591 # unescape U+DCxx characters
617 592 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
618 593 c = chr(ord(c.decode("utf-8")) & 0xff)
619 594 r += c
620 595 return r