# Source: Mercurial encoding.py, changeset r27356:c2effd1e (default branch)
# Commit: "encoding: use double backslash" by Gregory Szorc
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 from __future__ import absolute_import
9 9
10 10 import locale
11 11 import os
12 12 import unicodedata
13 13
14 14 from . import (
15 15 error,
16 16 )
17 17
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry is the UTF-8 byte sequence for one ignorable codepoint
# (zero-width and directional marks, plus U+FEFF).
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
# (hfsignoreclean's fast path assumes every sequence starts with one of
# these two lead bytes)
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
26 26
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint's UTF-8 form starts with \xe2 or \xef
    # (asserted next to _ignore), so a cheap containment test lets the
    # common all-ASCII case skip the replacement loop entirely.
    if "\xe2" not in s and "\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, '')
    return s
39 39
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # No nl_langinfo/CODESET on this platform; fall back to parsing
        # environment variables :-(
        return locale.getdefaultlocale()[1]

    # Temporarily switch LC_CTYPE to the user's environment setting so
    # nl_langinfo reports the real codeset, then restore the old locale.
    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)
    return codeset
63 63
# Map misleading encoding names reported by the platform to callables
# that yield the encoding actually to be used.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    # mac-roman is usually a misreport on darwin; re-derive from locale
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale-derived encoding when set
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# how undecodable bytes are handled: 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding assumed for repository data predating locale support
fallbackencoding = 'ISO-8859-1'
79 79
class localstr(str):
    '''A str subclass remembering the UTF-8 string it was derived from.

    The instance behaves as the locally-encoded string (its str value),
    while carrying the original UTF-8 bytes in _utf8 so the conversion
    back to UTF-8 is lossless.
    '''
    def __new__(cls, u, l):
        # u: known UTF-8 form; l: local-encoding form (the str value)
        self = str.__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
89 89
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # The outer try only catches LookupError (unknown encoding name);
    # decode failures are handled by the inner try/except chain.
    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: cache the UTF-8 original alongside it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                # retry with the historical fallback (ISO-8859-1)
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
150 150
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # lossless round-trip: tolocal() cached the UTF-8 form on the object
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a small context window around the offending byte
        start = inst.start
        sub = s[max(0, start - 10):start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
173 173
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The string lists east_asian_width() classes counted as two columns:
# 'W'ide and 'F'ullwidth always, plus 'A'mbiguous when requested.
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
177 177
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count as one column
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
181 181
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian width data in this Python; assume one column each
        return len(d)
    # wide classes ('W'/'F', optionally 'A') occupy two columns
    return sum([2 if eaw(c) in wide else 1 for c in d])
188 188
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # a c-column substring needs at least c bytes, so start the scan
    # there and grow one byte at a time until the display width matches;
    # returns None if no prefix reaches exactly c columns
    # NOTE(review): the scan stops at len(s) - 1, so a substring using
    # the final byte of s is never returned -- confirm callers never
    # request the whole tail
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
196 196
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: trim by raw byte count instead of columns
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # uslice drops i characters from the trimmed side; concat attaches
    # the ellipsis on that same side
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one character at a time until the remainder fits the width
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
289 289
290 290 def _asciilower(s):
291 291 '''convert a string to lowercase if ASCII
292 292
293 293 Raises UnicodeDecodeError if non-ASCII characters are found.'''
294 294 s.decode('ascii')
295 295 return s.lower()
296 296
def asciilower(s):
    """Lowercase an ASCII string, using the C implementation if available.

    Raises UnicodeDecodeError if s contains non-ASCII bytes."""
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation; fall back to the pure-Python one
    impl = getattr(parsers, 'asciilower', _asciilower)
    # memoize the lookup: rebind the module-level name so later calls
    # skip the import and getattr entirely
    global asciilower
    asciilower = impl
    return impl(s)
305 305
306 306 def _asciiupper(s):
307 307 '''convert a string to uppercase if ASCII
308 308
309 309 Raises UnicodeDecodeError if non-ASCII characters are found.'''
310 310 s.decode('ascii')
311 311 return s.upper()
312 312
def asciiupper(s):
    """Uppercase an ASCII string, using the C implementation if available.

    Raises UnicodeDecodeError if s contains non-ASCII bytes."""
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation; fall back to the pure-Python one
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # memoize the lookup: rebind the module-level name so later calls
    # skip the import and getattr entirely
    global asciiupper
    asciiupper = impl
    return impl(s)
321 321
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: pure-ASCII strings fold without any decoding
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass # contains non-ASCII bytes; take the Unicode path below
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        folded = u.lower()
        if folded == u:
            return s # preserve localstring
        return folded.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
342 342
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: pure-ASCII strings fold without any decoding
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present; use the slower Unicode-aware path
        return upperfallback(s)
349 349
def upperfallback(s):
    "Unicode-aware uppercasing of local string s (slow path for upper())"
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        folded = u.upper()
        if folded == u:
            return s # preserve localstring
        return folded.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(err, hint="please check your locale settings")
365 365
class normcasespecs(object):
    '''Enumeration of what a platform's normcase does to ASCII strings.

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    lower = -1
    upper = 1
380 380
# byte -> JSON-escaped-string table, populated lazily by jsonescape()
_jsonmap = {}
382 382
def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # Populate the escape table on first use.  Control characters
        # get \uXXXX escapes; note the double backslash so a literal
        # backslash-u lands in the output (bug fixed here: the diff
        # residue also carried the ambiguous single-backslash variant
        # of this line, which is dropped).
        for x in xrange(32):
            _jsonmap[chr(x)] = "\\u%04x" % x
        # every other byte maps to itself ...
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # ... except the characters JSON requires to be escaped
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    # convert to UTF-8b first so arbitrary bytes survive the escaping
    return ''.join(_jsonmap[c] for c in toutf8b(s))
423 423
# UTF-8 sequence length indexed by the lead byte's high nibble:
# 0x0-0x7 -> 0 (ASCII fast path), 0x8-0xb -> 1 (continuation byte; the
# one-byte decode attempt in getutf8char then raises), 0xc-0xd -> 2,
# 0xe -> 3, 0xf -> 4
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
425 425
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # the lead byte's high nibble determines the sequence length;
    # zero marks a plain ASCII byte
    width = _utf8len[ord(s[pos]) >> 4]
    if not width: # ascii
        return s[pos]

    char = s[pos:pos + width]
    # the decode result is discarded; it serves purely to validate the
    # byte sequence (raising UnicodeError when invalid or truncated)
    char.decode("utf-8")
    return char
442 442
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # fast path: \xed is the lead byte of every UTF-8-encoded U+DCxx
    # surrogate, so its absence means no re-escaping can be needed
    if "\xed" not in s:
        if isinstance(s, localstr):
            # cached UTF-8 form is already what we want
            return s._utf8
        try:
            # already valid UTF-8: pass through unmodified
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one UTF-8 character at a time,
    # surrogate-escaping anything that doesn't decode
    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r
499 499
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This reverses toutf8b(): surrogate-escaped bytes are restored to
    their original values, so it is a round-trip process for strings
    like filenames, but metadata that was passed through tolocal will
    remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    '''

    # fast path - look for uDxxx prefixes in s (\xed is the lead byte of
    # every UTF-8-encoded surrogate, so its absence means no escapes)
    if "\xed" not in s:
        return s

    parts = []
    for ch in s.decode("utf-8"):
        cp = ord(ch)
        if cp & 0xffff00 == 0xdc00:
            # surrogate-escaped byte: recover the original low byte
            parts.append(chr(cp & 0xff))
        else:
            parts.append(ch.encode("utf-8"))
    return "".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now