py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara
r34213:1c601df9 default
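This change wraps the input of toutf8b() and fromutf8b() in pycompat.bytestr. On Python 3, indexing a bytes object yields an int, so expressions like ord(s[pos]) in the surrogate-escape loops below would fail; pycompat.bytestr restores Python 2-style indexing, where s[pos] is a one-byte string. A minimal sketch of the idea (an illustrative stand-in, not the real pycompat.bytestr, which also handles construction and iteration):

    class bytestr(bytes):
        # integer indexing returns a 1-byte bytes object, as py2 str does
        def __getitem__(self, key):
            if isinstance(key, int):
                return bytes.__getitem__(self, slice(key, key + 1 or None))
            return bytes.__getitem__(self, key)

    s = bytestr(b'caf\xc3\xa9')
    assert ord(s[3]) == 0xc3  # with plain bytes, s[3] is already the int 0xc3
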
@@ -1,588 +1,590 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import, print_function
9 9
10 10 import io
11 11 import locale
12 12 import os
13 13 import unicodedata
14 14
15 15 from . import (
16 16 error,
17 17 policy,
18 18 pycompat,
19 19 )
20 20
21 21 from .pure import (
22 22 charencode as charencodepure,
23 23 )
24 24
25 25 charencode = policy.importmod(r'charencode')
26 26
27 27 isasciistr = charencode.isasciistr
28 28 asciilower = charencode.asciilower
29 29 asciiupper = charencode.asciiupper
30 30 _jsonescapeu8fast = charencode.jsonescapeu8fast
31 31
32 32 _sysstr = pycompat.sysstr
33 33
34 34 if pycompat.ispy3:
35 35 unichr = chr
36 36
37 37 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
38 38 # "Unicode Subtleties"), so we need to ignore them in some places for
39 39 # sanity.
40 40 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
41 41 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
42 42 "206a 206b 206c 206d 206e 206f feff".split()]
43 43 # verify the next function will work
44 44 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45 45
46 46 def hfsignoreclean(s):
47 47 """Remove codepoints ignored by HFS+ from s.
48 48
49 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
50 50 '.hg'
51 51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
52 52 '.hg'
53 53 """
54 54 if "\xe2" in s or "\xef" in s:
55 55 for c in _ignore:
56 56 s = s.replace(c, '')
57 57 return s
58 58
 59 59 # encoding.environ is provided read-only; it may not be used to modify
60 60 # the process environment
61 61 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
62 62 if not pycompat.ispy3:
63 63 environ = os.environ # re-exports
64 64 elif _nativeenviron:
65 65 environ = os.environb # re-exports
66 66 else:
67 67 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
68 68 # and recreate it once encoding is settled
69 69 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
70 70 for k, v in os.environ.items()) # re-exports
71 71
72 72 _encodingfixers = {
73 73 '646': lambda: 'ascii',
74 74 'ANSI_X3.4-1968': lambda: 'ascii',
75 75 }
76 76
77 77 try:
78 78 encoding = environ.get("HGENCODING")
79 79 if not encoding:
80 80 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
81 81 encoding = _encodingfixers.get(encoding, lambda: encoding)()
82 82 except locale.Error:
83 83 encoding = 'ascii'
84 84 encodingmode = environ.get("HGENCODINGMODE", "strict")
85 85 fallbackencoding = 'ISO-8859-1'
86 86
87 87 class localstr(bytes):
88 88 '''This class allows strings that are unmodified to be
89 89 round-tripped to the local encoding and back'''
90 90 def __new__(cls, u, l):
91 91 s = bytes.__new__(cls, l)
92 92 s._utf8 = u
93 93 return s
94 94 def __hash__(self):
95 95 return hash(self._utf8) # avoid collisions in local string space
96 96
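# Illustration: localstr pairs the known UTF-8 form with a possibly-lossy
# local rendering, so fromlocal() can recover the exact original bytes.
# A sketch, assuming an ascii local encoding:
#
#     ls = localstr(b'caf\xc3\xa9', b'caf?')
#     bytes(ls)   # -> 'caf?'         (the local rendering)
#     ls._utf8    # -> 'caf\xc3\xa9'  (what fromlocal() returns)
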
97 97 def tolocal(s):
98 98 """
99 99 Convert a string from internal UTF-8 to local encoding
100 100
101 101 All internal strings should be UTF-8 but some repos before the
102 102 implementation of locale support may contain latin1 or possibly
103 103 other character sets. We attempt to decode everything strictly
104 104 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
105 105 replace unknown characters.
106 106
107 107 The localstr class is used to cache the known UTF-8 encoding of
108 108 strings next to their local representation to allow lossless
109 109 round-trip conversion back to UTF-8.
110 110
111 111 >>> u = b'foo: \\xc3\\xa4' # utf-8
112 112 >>> l = tolocal(u)
113 113 >>> l
114 114 'foo: ?'
115 115 >>> fromlocal(l)
116 116 'foo: \\xc3\\xa4'
117 117 >>> u2 = b'foo: \\xc3\\xa1'
118 118 >>> d = { l: 1, tolocal(u2): 2 }
119 119 >>> len(d) # no collision
120 120 2
121 121 >>> b'foo: ?' in d
122 122 False
123 123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
124 124 >>> l = tolocal(l1)
125 125 >>> l
126 126 'foo: ?'
127 127 >>> fromlocal(l) # magically in utf-8
128 128 'foo: \\xc3\\xa4'
129 129 """
130 130
131 131 if isasciistr(s):
132 132 return s
133 133
134 134 try:
135 135 try:
136 136 # make sure string is actually stored in UTF-8
137 137 u = s.decode('UTF-8')
138 138 if encoding == 'UTF-8':
139 139 # fast path
140 140 return s
141 141 r = u.encode(_sysstr(encoding), u"replace")
142 142 if u == r.decode(_sysstr(encoding)):
143 143 # r is a safe, non-lossy encoding of s
144 144 return r
145 145 return localstr(s, r)
146 146 except UnicodeDecodeError:
147 147 # we should only get here if we're looking at an ancient changeset
148 148 try:
149 149 u = s.decode(_sysstr(fallbackencoding))
150 150 r = u.encode(_sysstr(encoding), u"replace")
151 151 if u == r.decode(_sysstr(encoding)):
152 152 # r is a safe, non-lossy encoding of s
153 153 return r
154 154 return localstr(u.encode('UTF-8'), r)
155 155 except UnicodeDecodeError:
156 156 u = s.decode("utf-8", "replace") # last ditch
157 157 # can't round-trip
158 158 return u.encode(_sysstr(encoding), u"replace")
159 159 except LookupError as k:
160 160 raise error.Abort(k, hint="please check your locale settings")
161 161
162 162 def fromlocal(s):
163 163 """
164 164 Convert a string from the local character encoding to UTF-8
165 165
166 166 We attempt to decode strings using the encoding mode set by
167 167 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
168 168 characters will cause an error message. Other modes include
169 169 'replace', which replaces unknown characters with a special
170 170 Unicode character, and 'ignore', which drops the character.
171 171 """
172 172
173 173 # can we do a lossless round-trip?
174 174 if isinstance(s, localstr):
175 175 return s._utf8
176 176 if isasciistr(s):
177 177 return s
178 178
179 179 try:
180 180 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
181 181 return u.encode("utf-8")
182 182 except UnicodeDecodeError as inst:
183 183 sub = s[max(0, inst.start - 10):inst.start + 10]
184 184 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
185 185 except LookupError as k:
186 186 raise error.Abort(k, hint="please check your locale settings")
187 187
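# Illustration: a sketch of the failure mode, assuming encoding is
# b'ascii' and the default HGENCODINGMODE=strict. Undecodable input
# aborts with up to 10 bytes of context around the bad byte:
#
#     fromlocal(b'caf\xe9')
#     # -> error.Abort: decoding near 'caf\xe9': 'ascii' codec can't
#     #    decode byte 0xe9 in position 3 ...!
#
# A localstr input instead short-circuits to its cached _utf8 form.
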
188 188 def unitolocal(u):
189 189 """Convert a unicode string to a byte string of local encoding"""
190 190 return tolocal(u.encode('utf-8'))
191 191
192 192 def unifromlocal(s):
193 193 """Convert a byte string of local encoding to a unicode string"""
194 194 return fromlocal(s).decode('utf-8')
195 195
196 196 def unimethod(bytesfunc):
197 197 """Create a proxy method that forwards __unicode__() and __str__() of
198 198 Python 3 to __bytes__()"""
199 199 def unifunc(obj):
200 200 return unifromlocal(bytesfunc(obj))
201 201 return unifunc
202 202
 203 203 # converter functions between native str and byte string. use these if the
 204 204 # string is not encoding-aware (e.g. exception messages) or is known to
 205 205 # be locale dependent (e.g. date formatting)
206 206 if pycompat.ispy3:
207 207 strtolocal = unitolocal
208 208 strfromlocal = unifromlocal
209 209 strmethod = unimethod
210 210 else:
211 211 strtolocal = pycompat.identity
212 212 strfromlocal = pycompat.identity
213 213 strmethod = pycompat.identity
214 214
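# Typical use of strmethod (a hypothetical sketch; 'changeid' is not a
# real class): derive __str__ from __bytes__ so str(obj) decodes via the
# local encoding on Python 3 and is a plain alias on Python 2:
#
#     class changeid(object):
#         def __bytes__(self):
#             return b'1c601df9'
#         __str__ = strmethod(__bytes__)
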
215 215 if not _nativeenviron:
216 216 # now encoding and helper functions are available, recreate the environ
217 217 # dict to be exported to other modules
218 218 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
219 219 for k, v in os.environ.items()) # re-exports
220 220
221 221 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
222 222 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
223 223 and "WFA" or "WF")
224 224
225 225 def colwidth(s):
226 226 "Find the column width of a string for display in the local encoding"
227 227 return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
228 228
229 229 def ucolwidth(d):
230 230 "Find the column width of a Unicode string for display"
231 231 eaw = getattr(unicodedata, 'east_asian_width', None)
232 232 if eaw is not None:
233 233 return sum([eaw(c) in _wide and 2 or 1 for c in d])
234 234 return len(d)
235 235
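# Illustration: unicodedata.east_asian_width() drives the count; 'W' and
# 'F' characters (plus 'A' when HGENCODINGAMBIGUOUS=wide) take two
# columns, everything else one:
#
#     ucolwidth(u'\u3042a')   # -> 3 ('W' kana counts 2, ascii 'a' counts 1)
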
236 236 def getcols(s, start, c):
237 237 '''Use colwidth to find a c-column substring of s starting at byte
238 238 index start'''
239 239 for x in xrange(start + c, len(s)):
240 240 t = s[start:x]
241 241 if colwidth(t) == c:
242 242 return t
243 243
244 244 def trim(s, width, ellipsis='', leftside=False):
245 245 """Trim string 's' to at most 'width' columns (including 'ellipsis').
246 246
247 247 If 'leftside' is True, left side of string 's' is trimmed.
248 248 'ellipsis' is always placed at trimmed side.
249 249
250 250 >>> from .node import bin
251 251 >>> def bprint(s):
252 252 ... print(pycompat.sysstr(s))
253 253 >>> ellipsis = b'+++'
254 254 >>> from . import encoding
255 255 >>> encoding.encoding = b'utf-8'
256 256 >>> t = b'1234567890'
257 257 >>> bprint(trim(t, 12, ellipsis=ellipsis))
258 258 1234567890
259 259 >>> bprint(trim(t, 10, ellipsis=ellipsis))
260 260 1234567890
261 261 >>> bprint(trim(t, 8, ellipsis=ellipsis))
262 262 12345+++
263 263 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
264 264 +++67890
265 265 >>> bprint(trim(t, 8))
266 266 12345678
267 267 >>> bprint(trim(t, 8, leftside=True))
268 268 34567890
269 269 >>> bprint(trim(t, 3, ellipsis=ellipsis))
270 270 +++
271 271 >>> bprint(trim(t, 1, ellipsis=ellipsis))
272 272 +
273 273 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
274 274 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
275 275 >>> bprint(trim(t, 12, ellipsis=ellipsis))
276 276 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
277 277 >>> bprint(trim(t, 10, ellipsis=ellipsis))
278 278 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
279 279 >>> bprint(trim(t, 8, ellipsis=ellipsis))
280 280 \xe3\x81\x82\xe3\x81\x84+++
281 281 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
282 282 +++\xe3\x81\x88\xe3\x81\x8a
283 283 >>> bprint(trim(t, 5))
284 284 \xe3\x81\x82\xe3\x81\x84
285 285 >>> bprint(trim(t, 5, leftside=True))
286 286 \xe3\x81\x88\xe3\x81\x8a
287 287 >>> bprint(trim(t, 4, ellipsis=ellipsis))
288 288 +++
289 289 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
290 290 +++
291 291 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
292 292 >>> bprint(trim(t, 12, ellipsis=ellipsis))
293 293 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
294 294 >>> bprint(trim(t, 10, ellipsis=ellipsis))
295 295 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
296 296 >>> bprint(trim(t, 8, ellipsis=ellipsis))
297 297 \x11\x22\x33\x44\x55+++
298 298 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
299 299 +++\x66\x77\x88\x99\xaa
300 300 >>> bprint(trim(t, 8))
301 301 \x11\x22\x33\x44\x55\x66\x77\x88
302 302 >>> bprint(trim(t, 8, leftside=True))
303 303 \x33\x44\x55\x66\x77\x88\x99\xaa
304 304 >>> bprint(trim(t, 3, ellipsis=ellipsis))
305 305 +++
306 306 >>> bprint(trim(t, 1, ellipsis=ellipsis))
307 307 +
308 308 """
309 309 try:
310 310 u = s.decode(_sysstr(encoding))
311 311 except UnicodeDecodeError:
312 312 if len(s) <= width: # trimming is not needed
313 313 return s
314 314 width -= len(ellipsis)
 315 315 if width <= 0: # not enough room even for the ellipsis
316 316 return ellipsis[:width + len(ellipsis)]
317 317 if leftside:
318 318 return ellipsis + s[-width:]
319 319 return s[:width] + ellipsis
320 320
321 321 if ucolwidth(u) <= width: # trimming is not needed
322 322 return s
323 323
324 324 width -= len(ellipsis)
 325 325 if width <= 0: # not enough room even for the ellipsis
326 326 return ellipsis[:width + len(ellipsis)]
327 327
328 328 if leftside:
329 329 uslice = lambda i: u[i:]
330 330 concat = lambda s: ellipsis + s
331 331 else:
332 332 uslice = lambda i: u[:-i]
333 333 concat = lambda s: s + ellipsis
334 334 for i in xrange(1, len(u)):
335 335 usub = uslice(i)
336 336 if ucolwidth(usub) <= width:
337 337 return concat(usub.encode(_sysstr(encoding)))
 338 338 return ellipsis # not enough room for multi-column characters
339 339
340 340 def lower(s):
341 341 "best-effort encoding-aware case-folding of local string s"
342 342 try:
343 343 return asciilower(s)
344 344 except UnicodeDecodeError:
345 345 pass
346 346 try:
347 347 if isinstance(s, localstr):
348 348 u = s._utf8.decode("utf-8")
349 349 else:
350 350 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
351 351
352 352 lu = u.lower()
353 353 if u == lu:
354 354 return s # preserve localstring
355 355 return lu.encode(_sysstr(encoding))
356 356 except UnicodeError:
357 357 return s.lower() # we don't know how to fold this except in ASCII
358 358 except LookupError as k:
359 359 raise error.Abort(k, hint="please check your locale settings")
360 360
361 361 def upper(s):
362 362 "best-effort encoding-aware case-folding of local string s"
363 363 try:
364 364 return asciiupper(s)
365 365 except UnicodeDecodeError:
366 366 return upperfallback(s)
367 367
368 368 def upperfallback(s):
369 369 try:
370 370 if isinstance(s, localstr):
371 371 u = s._utf8.decode("utf-8")
372 372 else:
373 373 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
374 374
375 375 uu = u.upper()
376 376 if u == uu:
377 377 return s # preserve localstring
378 378 return uu.encode(_sysstr(encoding))
379 379 except UnicodeError:
380 380 return s.upper() # we don't know how to fold this except in ASCII
381 381 except LookupError as k:
382 382 raise error.Abort(k, hint="please check your locale settings")
383 383
384 384 class normcasespecs(object):
385 385 '''what a platform's normcase does to ASCII strings
386 386
387 387 This is specified per platform, and should be consistent with what normcase
388 388 on that platform actually does.
389 389
390 390 lower: normcase lowercases ASCII strings
391 391 upper: normcase uppercases ASCII strings
392 392 other: the fallback function should always be called
393 393
394 394 This should be kept in sync with normcase_spec in util.h.'''
395 395 lower = -1
396 396 upper = 1
397 397 other = 0
398 398
399 399 def jsonescape(s, paranoid=False):
400 400 '''returns a string suitable for JSON
401 401
402 402 JSON is problematic for us because it doesn't support non-Unicode
403 403 bytes. To deal with this, we take the following approach:
404 404
405 405 - localstr objects are converted back to UTF-8
406 406 - valid UTF-8/ASCII strings are passed as-is
407 407 - other strings are converted to UTF-8b surrogate encoding
408 408 - apply JSON-specified string escaping
409 409
410 410 (escapes are doubled in these tests)
411 411
412 412 >>> jsonescape(b'this is a test')
413 413 'this is a test'
414 414 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
415 415 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
416 416 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
417 417 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
418 418 >>> jsonescape(b'a weird byte: \\xdd')
419 419 'a weird byte: \\xed\\xb3\\x9d'
420 420 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
421 421 'utf-8: caf\\xc3\\xa9'
422 422 >>> jsonescape(b'')
423 423 ''
424 424
425 425 If paranoid, non-ascii and common troublesome characters are also escaped.
426 426 This is suitable for web output.
427 427
428 428 >>> s = b'escape characters: \\0 \\x0b \\x7f'
429 429 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
430 430 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
431 431 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
432 432 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
433 433 'escape boundary: ~ \\\\u007f \\\\u0080'
434 434 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
435 435 'a weird byte: \\\\udcdd'
436 436 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
437 437 'utf-8: caf\\\\u00e9'
438 438 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
439 439 'non-BMP: \\\\ud834\\\\udd1e'
440 440 >>> jsonescape(b'<foo@example.org>', paranoid=True)
441 441 '\\\\u003cfoo@example.org\\\\u003e'
442 442 '''
443 443
444 444 u8chars = toutf8b(s)
445 445 try:
446 446 return _jsonescapeu8fast(u8chars, paranoid)
447 447 except ValueError:
448 448 pass
449 449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450 450
451 451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
452 452
453 453 def getutf8char(s, pos):
454 454 '''get the next full utf-8 character in the given string, starting at pos
455 455
456 456 Raises a UnicodeError if the given location does not start a valid
457 457 utf-8 character.
458 458 '''
459 459
460 460 # find how many bytes to attempt decoding from first nibble
461 461 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
462 462 if not l: # ascii
463 463 return s[pos:pos + 1]
464 464
465 465 c = s[pos:pos + l]
466 466 # validate with attempted decode
467 467 c.decode("utf-8")
468 468 return c
469 469
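# Illustration: the first nibble selects the sequence length, e.g.
# 0xc3 >> 4 == 0xc and _utf8len[0xc] == 2, so two bytes are sliced out
# and validated by the decode:
#
#     getutf8char(b'caf\xc3\xa9', 3)   # -> '\xc3\xa9'
#     getutf8char(b'caf\xc3\xa9', 4)   # raises UnicodeDecodeError: a lone
#                                      # continuation byte fails the decode
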
470 470 def toutf8b(s):
471 471 '''convert a local, possibly-binary string into UTF-8b
472 472
473 473 This is intended as a generic method to preserve data when working
474 474 with schemes like JSON and XML that have no provision for
475 475 arbitrary byte strings. As Mercurial often doesn't know
476 476 what encoding data is in, we use so-called UTF-8b.
477 477
478 478 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
 479 479 Otherwise, unsupported bytes are mapped into the UTF-16 surrogate
 480 480 range, U+DC00-U+DCFF.
481 481
482 482 Principles of operation:
483 483
484 484 - ASCII and UTF-8 data successfully round-trips and is understood
485 485 by Unicode-oriented clients
 486 486 - filenames and file contents in arbitrary other encodings can
 487 487 be round-tripped or recovered by clueful clients
488 488 - local strings that have a cached known UTF-8 encoding (aka
489 489 localstr) get sent as UTF-8 so Unicode-oriented clients get the
490 490 Unicode data they want
 491 491 - because we must preserve UTF-8 bytestrings in places such as
 492 492 filenames, metadata can't be round-tripped without help
493 493
494 494 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
495 495 arbitrary bytes into an internal Unicode format that can be
496 496 re-encoded back into the original. Here we are exposing the
497 497 internal surrogate encoding as a UTF-8 string.)
498 498 '''
499 499
500 500 if not isinstance(s, localstr) and isasciistr(s):
501 501 return s
502 502 if "\xed" not in s:
503 503 if isinstance(s, localstr):
504 504 return s._utf8
505 505 try:
506 506 s.decode('utf-8')
507 507 return s
508 508 except UnicodeDecodeError:
509 509 pass
510 510
511 s = pycompat.bytestr(s)
511 512 r = ""
512 513 pos = 0
513 514 l = len(s)
514 515 while pos < l:
515 516 try:
516 517 c = getutf8char(s, pos)
517 518 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
518 519 # have to re-escape existing U+DCxx characters
519 520 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
520 521 pos += 1
521 522 else:
522 523 pos += len(c)
523 524 except UnicodeDecodeError:
524 525 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
525 526 pos += 1
526 527 r += c
527 528 return r
528 529
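# Illustration: an invalid byte 0xdd maps to U+DCDD and is emitted as
# that code point's 3-byte UTF-8 form (the same value the jsonescape()
# doctest above expects):
#
#     toutf8b(b'\xdd')   # -> '\xed\xb3\x9d' (UTF-8 of U+DCDD)
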
529 530 def fromutf8b(s):
530 531 '''Given a UTF-8b string, return a local, possibly-binary string.
531 532
 532 533 This undoes toutf8b()'s surrogate escaping and returns the original
 533 534 binary string. This is a round-trip process for strings like filenames,
 534 535 but metadata that was passed through tolocal will remain in UTF-8.
535 536
536 537 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
537 538 >>> m = b"\\xc3\\xa9\\x99abcd"
538 539 >>> toutf8b(m)
539 540 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
540 541 >>> roundtrip(m)
541 542 True
542 543 >>> roundtrip(b"\\xc2\\xc2\\x80")
543 544 True
544 545 >>> roundtrip(b"\\xef\\xbf\\xbd")
545 546 True
546 547 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
547 548 True
548 549 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
549 550 True
550 551 '''
551 552
552 553 if isasciistr(s):
553 554 return s
 554 555 # fast path - look for U+DCxx prefixes in s
555 556 if "\xed" not in s:
556 557 return s
557 558
558 559 # We could do this with the unicode type but some Python builds
559 560 # use UTF-16 internally (issue5031) which causes non-BMP code
560 561 # points to be escaped. Instead, we use our handy getutf8char
561 562 # helper again to walk the string without "decoding" it.
562 563
564 s = pycompat.bytestr(s)
563 565 r = ""
564 566 pos = 0
565 567 l = len(s)
566 568 while pos < l:
567 569 c = getutf8char(s, pos)
568 570 pos += len(c)
569 571 # unescape U+DCxx characters
570 572 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
571 573 c = chr(ord(c.decode("utf-8")) & 0xff)
572 574 r += c
573 575 return r
574 576
575 577 if pycompat.ispy3:
576 578 class strio(io.TextIOWrapper):
577 579 """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
578 580
579 581 Also works around Python closing streams.
580 582 """
581 583
582 584 def __init__(self, buffer):
583 585 super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
584 586
585 587 def __del__(self):
586 588 """Override __del__ so it doesn't close the underlying stream."""
587 589 else:
588 590 strio = pycompat.identity
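# Hypothetical usage sketch on Python 3: wrap a byte stream so callers
# can write native str in hg's chosen encoding, without the underlying
# buffer being closed when the wrapper is garbage-collected:
#
#     import sys
#     out = strio(sys.stdout.buffer)
#     out.write(u'caf\xe9\n')
#     out.flush()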