##// END OF EJS Templates
encoding: extend test cases for utf8b...
Matt Mackall -
r26963:de5ae97c default
parent child Browse files
Show More
@@ -1,519 +1,525 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean()'s fast path
# assumes every ignorable sequence starts with one of these two bytes
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19 19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # the UTF-8 form of every ignorable codepoint starts with \xe2 or
    # \xef (checked by the module-level assert), so a cheap containment
    # test lets most strings skip the replace loop entirely
    if "\xe2" in s or "\xef" in s:
        for junk in _ignore:
            s = s.replace(junk, '')
    return s
32 32
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        result = locale.nl_langinfo(locale.CODESET)
    finally:
        # always restore the process-wide LC_CTYPE setting, even if the
        # query above raises, so we never leak a modified locale
        locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
56 56
# map misreported encoding names to a fixer; values are callables so the
# (potentially costly) correction runs only when that name is seen
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale-derived encoding when set
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# codec error handler for fromlocal(): 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() when strict UTF-8 decoding of old data fails
fallbackencoding = 'ISO-8859-1'
72 72
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    # u: the known UTF-8 form; l: the locally-encoded form (the str value)
    def __new__(cls, u, l):
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
82 82
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the true UTF-8 bytes cached on the local form
            # so fromlocal() can still round-trip without loss
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                # cache the re-encoded UTF-8 form, not the original bytes
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(k, hint="please check your locale settings")
143 143
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # localstr carries its original UTF-8 bytes: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes of context on either side of the failure
        start = inst.start
        sub = s[max(0, start - 10):start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
166 166
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is matched against unicodedata.east_asian_width() results in
# ucolwidth(): 'W' and 'F' are always wide; 'A' (ambiguous) only when
# HGENCODINGAMBIGUOUS=wide is set in the environment.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
170 170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count one column
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
174 174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    # east_asian_width may be absent from stripped-down unicodedata builds
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    # characters whose width class appears in 'wide' occupy two columns
    return sum([2 if eaw(c) in wide else 1 for c in d])
181 181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Falls through (returning None) when no prefix of the remainder
    measures exactly c display columns.'''
    # a c-column substring needs at least c bytes, so start probing there
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
189 189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-based trimming, where one
        # byte is assumed to take one column (see the doctests above)
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one character at a time from the trimmed side until the
    # remainder (plus ellipsis) fits in the requested column width
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
282 282
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is validation only: it raises on any non-ASCII byte
    s.decode('ascii')
    return s.lower()
289 289
def asciilower(s):
    # importing here (not at module level) avoids the cyclic dependency
    # around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    fn = getattr(parsers, 'asciilower', _asciilower)
    # memoize the chosen implementation by rebinding the module-level
    # name, so future calls skip the import and getattr entirely
    global asciilower
    asciilower = fn
    return fn(s)
298 298
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is validation only: it raises on any non-ASCII byte
    s.decode('ascii')
    return s.upper()
305 305
def asciiupper(s):
    # importing here (not at module level) avoids the cyclic dependency
    # around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    fn = getattr(parsers, 'asciiupper', _asciiupper)
    # memoize the chosen implementation by rebinding the module-level
    # name, so future calls skip the import and getattr entirely
    global asciiupper
    asciiupper = fn
    return fn(s)
314 314
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: pure-ASCII strings fold without decoding
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known UTF-8 form of the string
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)
        folded = u.lower()
        # return the input unchanged when folding was a no-op, which
        # also preserves any localstr caching on it
        if folded == u:
            return s
        return folded.encode(encoding)
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
335 335
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the encoding-aware slow path
        return upperfallback(s)
342 342
def upperfallback(s):
    # encoding-aware uppercasing for strings that are not pure ASCII
    try:
        if isinstance(s, localstr):
            # use the cached, known UTF-8 form of the string
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)
        folded = u.upper()
        # return the input unchanged when folding was a no-op, which
        # also preserves any localstr caching on it
        if folded == u:
            return s
        return folded.encode(encoding)
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
358 358
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # these integer values mirror the normcase_spec constants in util.h
    other = 0
    upper = 1
    lower = -1
373 373
# byte -> JSON-escaped-text table, populated lazily by jsonescape()
_jsonmap = {}
375 375
def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # lazily build the byte -> escaped-text table on first use
        for x in xrange(32):
            # control characters need \uXXXX escapes; write the
            # backslash explicitly ("\\u") rather than relying on the
            # unrecognized '\u' escape passing through a byte string
            # literal unchanged (identical bytes, but unambiguous)
            _jsonmap[chr(x)] = "\\u%04x" % x
        for x in xrange(32, 256):
            # all other bytes pass through unchanged; non-UTF-8 data
            # has already been made safe by toutf8b() below
            c = chr(x)
            _jsonmap[c] = c
        # JSON's short escapes override the defaults set above
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))
416 416
# UTF-8 sequence length indexed by the first byte's high nibble; 0 marks
# a single-byte ASCII value, and nibbles 8-11 (continuation bytes) map to
# 1 so getutf8char()'s strict decode of the slice rejects them
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
418 418
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # the first byte's high nibble determines the sequence length
    nbytes = _utf8len[ord(s[pos]) >> 4]
    if not nbytes:
        # a zero length marks a plain ASCII byte: no validation needed
        return s[pos]

    c = s[pos:pos + nbytes]
    # strict decode of the slice is used purely to validate the sequence
    c.decode("utf-8")
    return c
435 435
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # \xed is the first byte of any UTF-8-encoded U+DCxx sequence, so
    # its absence means no re-escaping can be required
    if "\xed" not in s:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 form
            return s._utf8
        try:
            # already valid UTF-8 (or ASCII): pass through unmodified
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: smuggle it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
492 492
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b: bytes that were escaped into the U+DCxx
    surrogate range are turned back into their original form. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    '''

    # fast path - the UTF-8 encoding of any U+DCxx escape starts with
    # \xed, so its absence means there is nothing to unescape
    if "\xed" not in s:
        return s

    parts = []
    for ch in s.decode("utf-8"):
        cp = ord(ch)
        if cp & 0xffff00 == 0xdc00:
            # U+DC00..U+DCFF carries the escaped raw byte in its low 8 bits
            parts.append(chr(cp & 0xff))
        else:
            parts.append(ch.encode("utf-8"))
    return "".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now