##// END OF EJS Templates
encoding: use raw strings for encoding arguments...
Gregory Szorc -
r42002:25694a78 default
parent child Browse files
Show More
@@ -1,616 +1,616 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from . import (
15 15 error,
16 16 policy,
17 17 pycompat,
18 18 )
19 19
20 20 from .pure import (
21 21 charencode as charencodepure,
22 22 )
23 23
charencode = policy.importmod(r'charencode')

# Module-level shortcuts to helpers provided by the loaded charencode module.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr() builtin; chr() is bound to that name instead.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry of _ignore is the UTF-8 encoding of one listed code point.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
# (hfsignoreclean only scans strings that contain one of these lead bytes)
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44 44
def hfsignoreclean(s):
    """Strip every codepoint that HFS+ ignores out of s and return the result.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # all ignored sequences begin with one of these two bytes, so a string
    # containing neither is returned untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
57 57
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
# _nativeenviron is true when a bytes-keyed environment mapping can be used
# directly (always on Python 2, or when os.supports_bytes_environ is set).
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8'))
                   for k, v in os.environ.items()) # re-exports

# Encoding names that are replaced by the name on the right before use.
_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites['cp65001'] = 'utf-8'

# Pick the local encoding: HGENCODING wins when set and non-empty, otherwise
# the locale's preferred encoding (after rewriting) is used, and 'ascii' is
# the fallback when the locale cannot be queried.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = 'ascii'
# Error-handling mode passed to decode() by fromlocal(), lower() and
# upperfallback().
encodingmode = environ.get("HGENCODINGMODE", "strict")
# Encoding tried by tolocal() when a string is not valid UTF-8.
fallbackencoding = 'ISO-8859-1'
90 90
class localstr(bytes):
    '''Byte string in the local encoding that also carries the UTF-8 form
    it was produced from, so that an unmodified string can be round-tripped
    to the local encoding and back.

    The bytes value is the local representation ``l``; the UTF-8 original
    ``u`` is kept in the ``_utf8`` attribute.'''
    def __new__(cls, u, l):
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hashing the UTF-8 form avoids collisions in local string space
        return hash(self._utf8)
100 100
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    The class adds nothing to bytes: equality, hashing and content are
    those of the underlying byte string. It only serves as a marker that
    can be detected with isinstance().

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
110 110
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Returns s itself for pure-ASCII input (and for valid UTF-8 input when
    the local encoding is 'UTF-8'), a safelocalstr when the conversion is
    lossless, a localstr when it is lossy, and plain bytes when the input
    could not be decoded without replacement.

    Raises error.Abort if the local encoding name is not known to Python.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        # hand Abort a byte string rather than the exception object, the
        # same way fromlocal() treats the UnicodeDecodeError it reports
        raise error.Abort(pycompat.bytestr(k),
                          hint="please check your locale settings")
175 175
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    A localstr is converted by returning its cached UTF-8 form, and a
    pure-ASCII string is returned unchanged.

    Raises error.Abort if s cannot be decoded or if the local encoding
    name is not known to Python.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes on each side of the offending position
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!"
                          % (sub, pycompat.bytestr(inst)))
    except LookupError as k:
        # byte-stringify the exception, as done for UnicodeDecodeError above
        raise error.Abort(pycompat.bytestr(k),
                          hint="please check your locale settings")
202 202
def unitolocal(u):
    """Return unicode string u as a byte string in the local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
206 206
def unifromlocal(s):
    """Return local-encoding byte string s as a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
210 210
def unimethod(bytesfunc):
    """Wrap bytesfunc so that it can serve as __unicode__() or as Python 3's
    __str__(): the wrapper calls bytesfunc (normally __bytes__()) and
    converts its local-encoding result to a unicode string"""
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc
217 217
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # native str is unicode here, so real conversion is needed
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 all three are bound to pycompat.identity
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
                   for k, v in os.environ.items()) # re-exports

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd()) # re-exports
    else:
        getcwd = os.getcwdb # re-exports
else:
    getcwd = os.getcwd # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide holds the east_asian_width() category letters that ucolwidth()
# counts as two columns; "A" is included only when HGENCODINGAMBIGUOUS is
# "wide".
_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                and "WFA" or "WF")
251 251
def colwidth(s):
    """Return the number of display columns taken by local-encoding string s

    Bytes that cannot be decoded in the local encoding are replaced before
    the width is measured."""
    decoded = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(decoded)
255 255
def ucolwidth(d):
    """Return the number of display columns taken by Unicode string d

    Characters whose east asian width category is listed in _wide count as
    two columns, all others as one. Without unicodedata.east_asian_width
    the length of d is returned."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
262 262
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate substrings s[start:x] are tried with growing x, and the first
    one whose column width is exactly c is returned. None is returned when
    no candidate matches.'''
    # the upper bound is len(s) + 1 so that the candidate extending to the
    # very end of s is tried as well
    for x in pycompat.xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None
270 270
def trim(s, width, ellipsis='', leftside=False):
    """Shorten string 's' so that it fits in 'width' display columns.

    The column budget includes 'ellipsis', which is attached on the side
    that gets cut: the right side by default, the left side if 'leftside'
    is True. If 's' cannot be decoded in the local encoding, bytes are
    counted instead of columns.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # not valid in the local encoding: work on raw bytes instead
        if len(s) <= width: # short enough already
            return s
        room = width - len(ellipsis)
        if room <= 0: # not enough room even for the ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # short enough already
        return s

    room = width - len(ellipsis)
    if room <= 0: # not enough room even for the ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on each pass until what
    # remains fits beside the ellipsis
    for i in pycompat.xrange(1, len(u)):
        if leftside:
            usub = u[i:]
        else:
            usub = u[:-i]
        if ucolwidth(usub) <= room:
            kept = usub.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    return ellipsis # not enough room for multi-column characters
366 366
def lower(s):
    """best-effort encoding-aware case-folding of local string s

    The ASCII-only helper is tried first. When it raises
    UnicodeDecodeError, s is decoded (from its cached UTF-8 form if it is
    a localstr, otherwise from the local encoding), lowercased, and
    encoded back to the local encoding. s itself is returned when
    lowercasing changes nothing.

    Raises error.Abort if the local encoding name is not known to Python.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        # hand Abort a byte string rather than the exception object, as
        # fromlocal() does for the UnicodeDecodeError it reports
        raise error.Abort(pycompat.bytestr(k),
                          hint="please check your locale settings")
387 387
def upper(s):
    """best-effort encoding-aware uppercasing of local string s

    The ASCII-only helper is used when it succeeds; otherwise the work is
    handed to upperfallback()."""
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        folded = upperfallback(s)
    return folded
394 394
def upperfallback(s):
    """Uppercase local string s by going through unicode

    s is decoded (from its cached UTF-8 form if it is a localstr,
    otherwise from the local encoding), uppercased, and encoded back to
    the local encoding. s itself is returned when uppercasing changes
    nothing.

    Raises error.Abort if the local encoding name is not known to Python.
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        # hand Abort a byte string rather than the exception object, as
        # fromlocal() does for the UnicodeDecodeError it reports
        raise error.Abort(pycompat.bytestr(k),
                          hint="please check your locale settings")
410 410
class normcasespecs(object):
    '''constants describing what a platform's normcase does to ASCII strings

    The value is chosen per platform and has to agree with what normcase
    really does on that platform.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower, other, upper = -1, 0, 1
425 425
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON has no way to carry non-Unicode bytes, so the input is processed
    as follows:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    The fast escaper is tried first; the pure fallback is used when it
    raises ValueError.

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        escaped = _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the fast escaper rejected the input; use the pure fallback below
        pass
    else:
        return escaped
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
477 477
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Indexed by the high nibble of a byte: how many bytes getutf8char() tries to
# decode starting at that byte. 0 makes it return the single byte unchecked.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
486 486
def getutf8char(s, pos):
    '''return the complete utf-8 character of s that begins at index pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos:pos + 1]
    # the first nibble tells how many bytes to attempt decoding
    nbytes = _utf8len[ord(lead) >> 4]
    if nbytes == 0: # ascii
        return lead

    char = s[pos:pos + nbytes]
    # decoding is done purely for validation; the result is discarded
    char.decode("utf-8", _utf8strict)
    return char
503 503
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if "\xed" not in s:
        # no byte that could start a U+DCxx sequence: if the whole string
        # validates as UTF-8 it can be returned as-is
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    def escapebyte(pos):
        # map the single byte at pos into the U+DCxx range
        return unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

    s = pycompat.bytestr(s)
    # pieces are collected in a list and joined once at the end, so building
    # the result stays linear in the length of s
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = escapebyte(pos)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = escapebyte(pos)
            pos += 1
        chunks.append(c)
    return "".join(chunks)
570 570
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Returns the original binary string. This is a round-trip process for
    strings like filenames, but metadata that was passed through tolocal
    will remain in UTF-8.

    Raises a UnicodeError if s contains a byte sequence that getutf8char()
    rejects.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # pieces are collected in a list and joined once at the end, so building
    # the result stays linear in the length of s
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
        chunks.append(c)
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now