encoding: use range() instead of xrange()...
Gregory Szorc
r28508:3c6e94d0 default
@@ -1,579 +1,579 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import array
11 11 import locale
12 12 import os
13 13 import sys
14 14 import unicodedata
15 15
16 16 from . import (
17 17 error,
18 18 )
19 19
20 20 if sys.version_info[0] >= 3:
21 21 unichr = chr
22 22
23 23 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
24 24 # "Unicode Subtleties"), so we need to ignore them in some places for
25 25 # sanity.
26 26 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
27 27 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
28 28 "206a 206b 206c 206d 206e 206f feff".split()]
29 29 # verify the next function will work
30 30 if sys.version_info[0] >= 3:
31 31 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
32 32 else:
33 33 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
34 34
35 35 def hfsignoreclean(s):
36 36 """Remove codepoints ignored by HFS+ from s.
37 37
38 38 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
39 39 '.hg'
40 40 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
41 41 '.hg'
42 42 """
43 43 if "\xe2" in s or "\xef" in s:
44 44 for c in _ignore:
45 45 s = s.replace(c, '')
46 46 return s
47 47
48 48 def _getpreferredencoding():
49 49 '''
50 50 On darwin, getpreferredencoding ignores the locale environment and
51 51 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
52 52 for Python 2.7 and up. This is the same corrected code for earlier
53 53 Python versions.
54 54
55 55 However, we can't use a version check for this method, as some distributions
56 56 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
57 57 encoding, as it is unlikely that this encoding is actually the expected one.
58 58 '''
59 59 try:
60 60 locale.CODESET
61 61 except AttributeError:
62 62 # Fall back to parsing environment variables :-(
63 63 return locale.getdefaultlocale()[1]
64 64
65 65 oldloc = locale.setlocale(locale.LC_CTYPE)
66 66 locale.setlocale(locale.LC_CTYPE, "")
67 67 result = locale.nl_langinfo(locale.CODESET)
68 68 locale.setlocale(locale.LC_CTYPE, oldloc)
69 69
70 70 return result
71 71
72 72 _encodingfixers = {
73 73 '646': lambda: 'ascii',
74 74 'ANSI_X3.4-1968': lambda: 'ascii',
75 75 'mac-roman': _getpreferredencoding
76 76 }
77 77
78 78 try:
79 79 encoding = os.environ.get("HGENCODING")
80 80 if not encoding:
81 81 encoding = locale.getpreferredencoding() or 'ascii'
82 82 encoding = _encodingfixers.get(encoding, lambda: encoding)()
83 83 except locale.Error:
84 84 encoding = 'ascii'
85 85 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
86 86 fallbackencoding = 'ISO-8859-1'
87 87
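A minimal standalone sketch of the fixer-table pattern above (the names here are hypothetical, not part of the module): known-bad codeset names are corrected, while unmapped names fall through unchanged via the default lambda.

    # hypothetical sketch of the _encodingfixers lookup above
    fixers = {
        '646': lambda: 'ascii',             # Solaris spelling of ASCII
        'ANSI_X3.4-1968': lambda: 'ascii',  # glibc spelling of ASCII
    }
    def fixencoding(name):
        # unmapped names pass through unchanged
        return fixers.get(name, lambda: name)()

    assert fixencoding('646') == 'ascii'
    assert fixencoding('UTF-8') == 'UTF-8'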
88 88 class localstr(str):
89 89 '''This class allows strings that are unmodified to be
90 90 round-tripped to the local encoding and back'''
91 91 def __new__(cls, u, l):
92 92 s = str.__new__(cls, l)
93 93 s._utf8 = u
94 94 return s
95 95 def __hash__(self):
96 96 return hash(self._utf8) # avoid collisions in local string space
97 97
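Why __hash__ is overridden: two distinct UTF-8 strings can lossily encode to the same local bytes, and hashing the local form would make them collide as dict keys. A small sketch of the idea (byte values hypothetical), assuming Python 2 semantics where str is bytes:

    u1, u2 = 'caf\xc3\xa9', 'caf\xc3\xa1'   # e-acute vs. a-acute in UTF-8
    lossy = 'caf?'                          # both replace to the same local form
    d = {localstr(u1, lossy): 1, localstr(u2, lossy): 2}
    assert len(d) == 2                      # no collision: hash uses ._utf8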
98 98 def tolocal(s):
99 99 """
100 100 Convert a string from internal UTF-8 to local encoding
101 101
102 102 All internal strings should be UTF-8 but some repos before the
103 103 implementation of locale support may contain latin1 or possibly
104 104 other character sets. We attempt to decode everything strictly
105 105 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
106 106 replace unknown characters.
107 107
108 108 The localstr class is used to cache the known UTF-8 encoding of
109 109 strings next to their local representation to allow lossless
110 110 round-trip conversion back to UTF-8.
111 111
112 112 >>> u = 'foo: \\xc3\\xa4' # utf-8
113 113 >>> l = tolocal(u)
114 114 >>> l
115 115 'foo: ?'
116 116 >>> fromlocal(l)
117 117 'foo: \\xc3\\xa4'
118 118 >>> u2 = 'foo: \\xc3\\xa1'
119 119 >>> d = { l: 1, tolocal(u2): 2 }
120 120 >>> len(d) # no collision
121 121 2
122 122 >>> 'foo: ?' in d
123 123 False
124 124 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
125 125 >>> l = tolocal(l1)
126 126 >>> l
127 127 'foo: ?'
128 128 >>> fromlocal(l) # magically in utf-8
129 129 'foo: \\xc3\\xa4'
130 130 """
131 131
132 132 try:
133 133 try:
134 134 # make sure string is actually stored in UTF-8
135 135 u = s.decode('UTF-8')
136 136 if encoding == 'UTF-8':
137 137 # fast path
138 138 return s
139 139 r = u.encode(encoding, "replace")
140 140 if u == r.decode(encoding):
141 141 # r is a safe, non-lossy encoding of s
142 142 return r
143 143 return localstr(s, r)
144 144 except UnicodeDecodeError:
145 145 # we should only get here if we're looking at an ancient changeset
146 146 try:
147 147 u = s.decode(fallbackencoding)
148 148 r = u.encode(encoding, "replace")
149 149 if u == r.decode(encoding):
150 150 # r is a safe, non-lossy encoding of s
151 151 return r
152 152 return localstr(u.encode('UTF-8'), r)
153 153 except UnicodeDecodeError:
154 154 u = s.decode("utf-8", "replace") # last ditch
155 155 return u.encode(encoding, "replace") # can't round-trip
156 156 except LookupError as k:
157 157 raise error.Abort(k, hint="please check your locale settings")
158 158
159 159 def fromlocal(s):
160 160 """
161 161 Convert a string from the local character encoding to UTF-8
162 162
163 163 We attempt to decode strings using the encoding mode set by
164 164 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
165 165 characters will cause an error message. Other modes include
166 166 'replace', which replaces unknown characters with a special
167 167 Unicode character, and 'ignore', which drops the character.
168 168 """
169 169
170 170 # can we do a lossless round-trip?
171 171 if isinstance(s, localstr):
172 172 return s._utf8
173 173
174 174 try:
175 175 return s.decode(encoding, encodingmode).encode("utf-8")
176 176 except UnicodeDecodeError as inst:
177 177 sub = s[max(0, inst.start - 10):inst.start + 10]
178 178 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
179 179 except LookupError as k:
180 180 raise error.Abort(k, hint="please check your locale settings")
181 181
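The three HGENCODINGMODE values map directly onto Python codec error handlers. A brief sketch of what each mode does to an invalid byte, assuming Python 2 semantics:

    raw = 'caf\xe9'                                        # latin-1 bytes, invalid UTF-8
    assert raw.decode('utf-8', 'replace') == u'caf\ufffd'  # substitute U+FFFD
    assert raw.decode('utf-8', 'ignore') == u'caf'         # drop the bad byte
    try:
        raw.decode('utf-8', 'strict')                      # 'strict' raises instead
    except UnicodeDecodeError:
        pass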
182 182 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
183 183 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
184 184 and "WFA" or "WF")
185 185
186 186 def colwidth(s):
187 187 "Find the column width of a string for display in the local encoding"
188 188 return ucolwidth(s.decode(encoding, 'replace'))
189 189
190 190 def ucolwidth(d):
191 191 "Find the column width of a Unicode string for display"
192 192 eaw = getattr(unicodedata, 'east_asian_width', None)
193 193 if eaw is not None:
194 194 return sum([eaw(c) in wide and 2 or 1 for c in d])
195 195 return len(d)
196 196
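ucolwidth counts 2 columns for characters whose East Asian width class is in the membership string built above ('WF', or 'WFA' when HGENCODINGAMBIGUOUS=wide) and 1 column otherwise. A small sketch of the underlying classification, assuming Python 2 unicode literals:

    import unicodedata
    assert unicodedata.east_asian_width(u'A') == 'Na'      # narrow: 1 column
    assert unicodedata.east_asian_width(u'\u3042') == 'W'  # hiragana: 2 columns
    # two hiragana characters occupy 2 + 2 = 4 columns under 'WF':
    assert sum(2 if unicodedata.east_asian_width(c) in 'WF' else 1
               for c in u'\u3042\u3044') == 4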
197 197 def getcols(s, start, c):
198 198 '''Use colwidth to find a c-column substring of s starting at byte
199 199 index start'''
200 200 for x in xrange(start + c, len(s)):
201 201 t = s[start:x]
202 202 if colwidth(t) == c:
203 203 return t
204 204
205 205 def trim(s, width, ellipsis='', leftside=False):
206 206 """Trim string 's' to at most 'width' columns (including 'ellipsis').
207 207
208 208 If 'leftside' is True, left side of string 's' is trimmed.
209 209 'ellipsis' is always placed at trimmed side.
210 210
211 211 >>> ellipsis = '+++'
212 212 >>> from . import encoding
213 213 >>> encoding.encoding = 'utf-8'
214 214 >>> t = '1234567890'
215 215 >>> print trim(t, 12, ellipsis=ellipsis)
216 216 1234567890
217 217 >>> print trim(t, 10, ellipsis=ellipsis)
218 218 1234567890
219 219 >>> print trim(t, 8, ellipsis=ellipsis)
220 220 12345+++
221 221 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
222 222 +++67890
223 223 >>> print trim(t, 8)
224 224 12345678
225 225 >>> print trim(t, 8, leftside=True)
226 226 34567890
227 227 >>> print trim(t, 3, ellipsis=ellipsis)
228 228 +++
229 229 >>> print trim(t, 1, ellipsis=ellipsis)
230 230 +
231 231 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
232 232 >>> t = u.encode(encoding.encoding)
233 233 >>> print trim(t, 12, ellipsis=ellipsis)
234 234 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
235 235 >>> print trim(t, 10, ellipsis=ellipsis)
236 236 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
237 237 >>> print trim(t, 8, ellipsis=ellipsis)
238 238 \xe3\x81\x82\xe3\x81\x84+++
239 239 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
240 240 +++\xe3\x81\x88\xe3\x81\x8a
241 241 >>> print trim(t, 5)
242 242 \xe3\x81\x82\xe3\x81\x84
243 243 >>> print trim(t, 5, leftside=True)
244 244 \xe3\x81\x88\xe3\x81\x8a
245 245 >>> print trim(t, 4, ellipsis=ellipsis)
246 246 +++
247 247 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
248 248 +++
249 249 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
250 250 >>> print trim(t, 12, ellipsis=ellipsis)
251 251 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
252 252 >>> print trim(t, 10, ellipsis=ellipsis)
253 253 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
254 254 >>> print trim(t, 8, ellipsis=ellipsis)
255 255 \x11\x22\x33\x44\x55+++
256 256 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
257 257 +++\x66\x77\x88\x99\xaa
258 258 >>> print trim(t, 8)
259 259 \x11\x22\x33\x44\x55\x66\x77\x88
260 260 >>> print trim(t, 8, leftside=True)
261 261 \x33\x44\x55\x66\x77\x88\x99\xaa
262 262 >>> print trim(t, 3, ellipsis=ellipsis)
263 263 +++
264 264 >>> print trim(t, 1, ellipsis=ellipsis)
265 265 +
266 266 """
267 267 try:
268 268 u = s.decode(encoding)
269 269 except UnicodeDecodeError:
270 270 if len(s) <= width: # trimming is not needed
271 271 return s
272 272 width -= len(ellipsis)
273 273 if width <= 0: # not enough room even for ellipsis
274 274 return ellipsis[:width + len(ellipsis)]
275 275 if leftside:
276 276 return ellipsis + s[-width:]
277 277 return s[:width] + ellipsis
278 278
279 279 if ucolwidth(u) <= width: # trimming is not needed
280 280 return s
281 281
282 282 width -= len(ellipsis)
283 283 if width <= 0: # not enough room even for ellipsis
284 284 return ellipsis[:width + len(ellipsis)]
285 285
286 286 if leftside:
287 287 uslice = lambda i: u[i:]
288 288 concat = lambda s: ellipsis + s
289 289 else:
290 290 uslice = lambda i: u[:-i]
291 291 concat = lambda s: s + ellipsis
292 292 for i in xrange(1, len(u)):
293 293 usub = uslice(i)
294 294 if ucolwidth(usub) <= width:
295 295 return concat(usub.encode(encoding))
296 296 return ellipsis # not enough room for multi-column characters
297 297
298 298 def _asciilower(s):
299 299 '''convert a string to lowercase if ASCII
300 300
301 301 Raises UnicodeDecodeError if non-ASCII characters are found.'''
302 302 s.decode('ascii')
303 303 return s.lower()
304 304
305 305 def asciilower(s):
306 306 # delay importing avoids cyclic dependency around "parsers" in
307 307 # pure Python build (util => i18n => encoding => parsers => util)
308 308 from . import parsers
309 309 impl = getattr(parsers, 'asciilower', _asciilower)
310 310 global asciilower
311 311 asciilower = impl
312 312 return impl(s)
313 313
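asciilower (and asciiupper below) use a rebind-on-first-call pattern: the first call resolves the best available implementation, then replaces the module-level name so later calls skip the getattr dispatch entirely. A hypothetical sketch of the pattern, not tied to the real parsers module:

    def _pyfold(s):
        return s.lower()

    def fold(s):
        try:
            from fastmodule import fold as impl   # hypothetical C speedup
        except ImportError:
            impl = _pyfold
        global fold
        fold = impl          # subsequent calls go straight to impl
        return impl(s)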
314 314 def _asciiupper(s):
315 315 '''convert a string to uppercase if ASCII
316 316
317 317 Raises UnicodeDecodeError if non-ASCII characters are found.'''
318 318 s.decode('ascii')
319 319 return s.upper()
320 320
321 321 def asciiupper(s):
322 322 # delay importing avoids cyclic dependency around "parsers" in
323 323 # pure Python build (util => i18n => encoding => parsers => util)
324 324 from . import parsers
325 325 impl = getattr(parsers, 'asciiupper', _asciiupper)
326 326 global asciiupper
327 327 asciiupper = impl
328 328 return impl(s)
329 329
330 330 def lower(s):
331 331 "best-effort encoding-aware case-folding of local string s"
332 332 try:
333 333 return asciilower(s)
334 334 except UnicodeDecodeError:
335 335 pass
336 336 try:
337 337 if isinstance(s, localstr):
338 338 u = s._utf8.decode("utf-8")
339 339 else:
340 340 u = s.decode(encoding, encodingmode)
341 341
342 342 lu = u.lower()
343 343 if u == lu:
344 344 return s # preserve localstring
345 345 return lu.encode(encoding)
346 346 except UnicodeError:
347 347 return s.lower() # we don't know how to fold this except in ASCII
348 348 except LookupError as k:
349 349 raise error.Abort(k, hint="please check your locale settings")
350 350
351 351 def upper(s):
352 352 "best-effort encoding-aware case-folding of local string s"
353 353 try:
354 354 return asciiupper(s)
355 355 except UnicodeDecodeError:
356 356 return upperfallback(s)
357 357
358 358 def upperfallback(s):
359 359 try:
360 360 if isinstance(s, localstr):
361 361 u = s._utf8.decode("utf-8")
362 362 else:
363 363 u = s.decode(encoding, encodingmode)
364 364
365 365 uu = u.upper()
366 366 if u == uu:
367 367 return s # preserve localstring
368 368 return uu.encode(encoding)
369 369 except UnicodeError:
370 370 return s.upper() # we don't know how to fold this except in ASCII
371 371 except LookupError as k:
372 372 raise error.Abort(k, hint="please check your locale settings")
373 373
374 374 class normcasespecs(object):
375 375 '''what a platform's normcase does to ASCII strings
376 376
377 377 This is specified per platform, and should be consistent with what normcase
378 378 on that platform actually does.
379 379
380 380 lower: normcase lowercases ASCII strings
381 381 upper: normcase uppercases ASCII strings
382 382 other: the fallback function should always be called
383 383
384 384 This should be kept in sync with normcase_spec in util.h.'''
385 385 lower = -1
386 386 upper = 1
387 387 other = 0
388 388
389 389 _jsonmap = []
390 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
391 _jsonmap.extend(chr(x) for x in xrange(32, 127))
390 _jsonmap.extend("\\u%04x" % x for x in range(32))
391 _jsonmap.extend(chr(x) for x in range(32, 127))
392 392 _jsonmap.append('\\u007f')
393 393 _jsonmap[0x09] = '\\t'
394 394 _jsonmap[0x0a] = '\\n'
395 395 _jsonmap[0x22] = '\\"'
396 396 _jsonmap[0x5c] = '\\\\'
397 397 _jsonmap[0x08] = '\\b'
398 398 _jsonmap[0x0c] = '\\f'
399 399 _jsonmap[0x0d] = '\\r'
400 400 _paranoidjsonmap = _jsonmap[:]
401 401 _paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
402 402 _paranoidjsonmap[0x3e] = '\\u003e' # '>'
403 _jsonmap.extend(chr(x) for x in xrange(128, 256))
403 _jsonmap.extend(chr(x) for x in range(128, 256))
404 404
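The maps above are plain byte-indexed lookup tables. Note that _paranoidjsonmap is copied before the high bytes are appended, so it has only 128 entries; any non-ASCII byte then raises IndexError in jsonescape's fast path and falls through to the \uXXXX slow path. A few sanity checks illustrating the layout as built above:

    assert len(_jsonmap) == 256          # every byte value has an entry
    assert len(_paranoidjsonmap) == 128  # high bytes deliberately absent
    assert _jsonmap[0x09] == '\\t'       # control chars get short escapes
    assert _jsonmap[ord('A')] == 'A'     # printable ASCII passes through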
405 405 def jsonescape(s, paranoid=False):
406 406 '''returns a string suitable for JSON
407 407
408 408 JSON is problematic for us because it doesn't support non-Unicode
409 409 bytes. To deal with this, we take the following approach:
410 410
411 411 - localstr objects are converted back to UTF-8
412 412 - valid UTF-8/ASCII strings are passed as-is
413 413 - other strings are converted to UTF-8b surrogate encoding
414 414 - apply JSON-specified string escaping
415 415
416 416 (escapes are doubled in these tests)
417 417
418 418 >>> jsonescape('this is a test')
419 419 'this is a test'
420 420 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
421 421 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
422 422 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
423 423 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
424 424 >>> jsonescape('a weird byte: \\xdd')
425 425 'a weird byte: \\xed\\xb3\\x9d'
426 426 >>> jsonescape('utf-8: caf\\xc3\\xa9')
427 427 'utf-8: caf\\xc3\\xa9'
428 428 >>> jsonescape('')
429 429 ''
430 430
431 431 If paranoid, non-ascii and common troublesome characters are also escaped.
432 432 This is suitable for web output.
433 433
434 434 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
435 435 'escape boundary: ~ \\\\u007f \\\\u0080'
436 436 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
437 437 'a weird byte: \\\\udcdd'
438 438 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
439 439 'utf-8: caf\\\\u00e9'
440 440 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
441 441 'non-BMP: \\\\ud834\\\\udd1e'
442 442 >>> jsonescape('<foo@example.org>', paranoid=True)
443 443 '\\\\u003cfoo@example.org\\\\u003e'
444 444 '''
445 445
446 446 if paranoid:
447 447 jm = _paranoidjsonmap
448 448 else:
449 449 jm = _jsonmap
450 450
451 451 u8chars = toutf8b(s)
452 452 try:
453 453 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
454 454 except IndexError:
455 455 pass
456 456 # non-BMP char is represented as UTF-16 surrogate pair
457 457 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
458 458 u16codes.pop(0) # drop BOM
459 459 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
460 460
461 461 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
462 462
463 463 def getutf8char(s, pos):
464 464 '''get the next full utf-8 character in the given string, starting at pos
465 465
466 466 Raises a UnicodeError if the given location does not start a valid
467 467 utf-8 character.
468 468 '''
469 469
470 470 # find how many bytes to attempt decoding from the first byte's high nibble
471 471 l = _utf8len[ord(s[pos]) >> 4]
472 472 if not l: # ascii
473 473 return s[pos]
474 474
475 475 c = s[pos:pos + l]
476 476 # validate with attempted decode
477 477 c.decode("utf-8")
478 478 return c
479 479
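The _utf8len table maps the high nibble of a lead byte to the sequence length: nibbles 0-7 are ASCII (marked 0), 8-B are continuation bytes (marked 1, so the attempted decode of a lone one fails as the docstring promises), C-D start 2-byte sequences, E 3-byte, F 4-byte. A short sketch, assuming Python 2 semantics:

    assert _utf8len[ord('a') >> 4] == 0          # ASCII fast path
    assert _utf8len[ord('\xc3') >> 4] == 2       # 2-byte lead (e.g. e-acute)
    assert _utf8len[ord('\xe3') >> 4] == 3       # 3-byte lead (e.g. hiragana)
    assert getutf8char('caf\xc3\xa9', 3) == '\xc3\xa9'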
480 480 def toutf8b(s):
481 481 '''convert a local, possibly-binary string into UTF-8b
482 482
483 483 This is intended as a generic method to preserve data when working
484 484 with schemes like JSON and XML that have no provision for
485 485 arbitrary byte strings. As Mercurial often doesn't know
486 486 what encoding data is in, we use so-called UTF-8b.
487 487
488 488 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
489 489 Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
490 490 U+DC00-U+DCFF.
491 491
492 492 Principles of operation:
493 493
494 494 - ASCII and UTF-8 data successfully round-trips and is understood
495 495 by Unicode-oriented clients
496 496 - filenames and file contents in arbitrary other encodings can
497 497 be round-tripped or recovered by clueful clients
498 498 - local strings that have a cached known UTF-8 encoding (aka
499 499 localstr) get sent as UTF-8 so Unicode-oriented clients get the
500 500 Unicode data they want
501 501 - because we must preserve UTF-8 bytestrings in places such as
502 502 filenames, metadata can't be roundtripped without help
503 503
504 504 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
505 505 arbitrary bytes into an internal Unicode format that can be
506 506 re-encoded back into the original. Here we are exposing the
507 507 internal surrogate encoding as a UTF-8 string.)
508 508 '''
509 509
510 510 if "\xed" not in s:
511 511 if isinstance(s, localstr):
512 512 return s._utf8
513 513 try:
514 514 s.decode('utf-8')
515 515 return s
516 516 except UnicodeDecodeError:
517 517 pass
518 518
519 519 r = ""
520 520 pos = 0
521 521 l = len(s)
522 522 while pos < l:
523 523 try:
524 524 c = getutf8char(s, pos)
525 525 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
526 526 # have to re-escape existing U+DCxx characters
527 527 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
528 528 pos += 1
529 529 else:
530 530 pos += len(c)
531 531 except UnicodeDecodeError:
532 532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
533 533 pos += 1
534 534 r += c
535 535 return r
536 536
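Concretely, each unmappable byte 0xNN becomes the three UTF-8 bytes encoding U+DCNN, which is why the fast paths here and in fromutf8b can key on '\xed'. A short illustration, assuming Python 2 semantics where lone surrogates encode:

    assert unichr(0xdc99).encode('utf-8') == '\xed\xb2\x99'
    assert toutf8b('\xc3\xa9\x99abcd') == '\xc3\xa9\xed\xb2\x99abcd'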
537 537 def fromutf8b(s):
538 538 '''Given a UTF-8b string, return a local, possibly-binary string.
539 539
540 540 Surrogate-escaped bytes are unescaped to return the original binary
541 541 string. This is a round-trip process for strings like filenames, but
542 542 metadata that was passed through tolocal will remain in UTF-8.
543 543
544 544 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
545 545 >>> m = "\\xc3\\xa9\\x99abcd"
546 546 >>> toutf8b(m)
547 547 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
548 548 >>> roundtrip(m)
549 549 True
550 550 >>> roundtrip("\\xc2\\xc2\\x80")
551 551 True
552 552 >>> roundtrip("\\xef\\xbf\\xbd")
553 553 True
554 554 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
555 555 True
556 556 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
557 557 True
558 558 '''
559 559
560 560 # fast path - look for uDxxx prefixes in s
561 561 if "\xed" not in s:
562 562 return s
563 563
564 564 # We could do this with the unicode type but some Python builds
565 565 # use UTF-16 internally (issue5031) which causes non-BMP code
566 566 # points to be escaped. Instead, we use our handy getutf8char
567 567 # helper again to walk the string without "decoding" it.
568 568
569 569 r = ""
570 570 pos = 0
571 571 l = len(s)
572 572 while pos < l:
573 573 c = getutf8char(s, pos)
574 574 pos += len(c)
575 575 # unescape U+DCxx characters
576 576 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
577 577 c = chr(ord(c.decode("utf-8")) & 0xff)
578 578 r += c
579 579 return r