encoding: define an enum that specifies what normcase does to ASCII strings...
Siddharth Agarwal
r24593:f473a1fe default
@@ -1,475 +1,488
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
11 11 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
12 12 # "Unicode Subtleties"), so we need to ignore them in some places for
13 13 # sanity.
14 14 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
15 15 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
16 16 "206a 206b 206c 206d 206e 206f feff".split()]
17 17 # verify the next function will work
18 18 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19 19
20 20 def hfsignoreclean(s):
21 21 """Remove codepoints ignored by HFS+ from s.
22 22
23 23 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
24 24 '.hg'
25 25 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
26 26 '.hg'
27 27 """
28 28 if "\xe2" in s or "\xef" in s:
29 29 for c in _ignore:
30 30 s = s.replace(c, '')
31 31 return s
32 32
33 33 def _getpreferredencoding():
34 34 '''
35 35 On darwin, getpreferredencoding ignores the locale environment and
36 36 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
37 37 for Python 2.7 and up. This is the same corrected code for earlier
38 38 Python versions.
39 39
40 40 However, we can't use a version check for this method, as some distributions
41 41 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
42 42 encoding, as it is unlikely that this encoding is the one actually expected.
43 43 '''
44 44 try:
45 45 locale.CODESET
46 46 except AttributeError:
47 47 # Fall back to parsing environment variables :-(
48 48 return locale.getdefaultlocale()[1]
49 49
50 50 oldloc = locale.setlocale(locale.LC_CTYPE)
51 51 locale.setlocale(locale.LC_CTYPE, "")
52 52 result = locale.nl_langinfo(locale.CODESET)
53 53 locale.setlocale(locale.LC_CTYPE, oldloc)
54 54
55 55 return result
56 56
57 57 _encodingfixers = {
58 58 '646': lambda: 'ascii',
59 59 'ANSI_X3.4-1968': lambda: 'ascii',
60 60 'mac-roman': _getpreferredencoding
61 61 }
62 62
63 63 try:
64 64 encoding = os.environ.get("HGENCODING")
65 65 if not encoding:
66 66 encoding = locale.getpreferredencoding() or 'ascii'
67 67 encoding = _encodingfixers.get(encoding, lambda: encoding)()
68 68 except locale.Error:
69 69 encoding = 'ascii'
70 70 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
71 71 fallbackencoding = 'ISO-8859-1'
72 72
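
The block above resolves the working encoding through a fixer table: known-problematic codeset names map to callables that return a better answer, and anything else falls through an identity lambda. A minimal standalone sketch of that dispatch pattern, with illustrative names (_fixers, resolve) that are not part of encoding.py:

# Hypothetical sketch of the fixer-table dispatch used above.
_fixers = {
    '646': lambda: 'ascii',             # "646" is ISO 646, i.e. ASCII
    'ANSI_X3.4-1968': lambda: 'ascii',  # glibc's formal name for ASCII
}

def resolve(detected):
    # unknown codesets pass through unchanged via the identity fallback
    return _fixers.get(detected, lambda: detected)()

assert resolve('646') == 'ascii'
assert resolve('UTF-8') == 'UTF-8'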
73 73 class localstr(str):
74 74 '''This class allows strings that are unmodified to be
75 75 round-tripped to the local encoding and back'''
76 76 def __new__(cls, u, l):
77 77 s = str.__new__(cls, l)
78 78 s._utf8 = u
79 79 return s
80 80 def __hash__(self):
81 81 return hash(self._utf8) # avoid collisions in local string space
82 82
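
A short illustration of what localstr buys us, assuming encoding.py is importable (e.g. as mercurial.encoding) under Python 2; the byte values are arbitrary:

# localstr compares as its local rendering but remembers the UTF-8
# original, which is what lets fromlocal() round-trip losslessly.
ls = localstr('caf\xc3\xa9', 'caf?')  # (utf-8 bytes, lossy local form)
assert ls == 'caf?'                   # behaves as the local string
assert ls._utf8 == 'caf\xc3\xa9'      # but keeps the exact original bytes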
83 83 def tolocal(s):
84 84 """
85 85 Convert a string from internal UTF-8 to local encoding
86 86
87 87 All internal strings should be UTF-8 but some repos before the
88 88 implementation of locale support may contain latin1 or possibly
89 89 other character sets. We attempt to decode everything strictly
90 90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
91 91 replace unknown characters.
92 92
93 93 The localstr class is used to cache the known UTF-8 encoding of
94 94 strings next to their local representation to allow lossless
95 95 round-trip conversion back to UTF-8.
96 96
97 97 >>> u = 'foo: \\xc3\\xa4' # utf-8
98 98 >>> l = tolocal(u)
99 99 >>> l
100 100 'foo: ?'
101 101 >>> fromlocal(l)
102 102 'foo: \\xc3\\xa4'
103 103 >>> u2 = 'foo: \\xc3\\xa1'
104 104 >>> d = { l: 1, tolocal(u2): 2 }
105 105 >>> len(d) # no collision
106 106 2
107 107 >>> 'foo: ?' in d
108 108 False
109 109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
110 110 >>> l = tolocal(l1)
111 111 >>> l
112 112 'foo: ?'
113 113 >>> fromlocal(l) # magically in utf-8
114 114 'foo: \\xc3\\xa4'
115 115 """
116 116
117 117 try:
118 118 try:
119 119 # make sure string is actually stored in UTF-8
120 120 u = s.decode('UTF-8')
121 121 if encoding == 'UTF-8':
122 122 # fast path
123 123 return s
124 124 r = u.encode(encoding, "replace")
125 125 if u == r.decode(encoding):
126 126 # r is a safe, non-lossy encoding of s
127 127 return r
128 128 return localstr(s, r)
129 129 except UnicodeDecodeError:
130 130 # we should only get here if we're looking at an ancient changeset
131 131 try:
132 132 u = s.decode(fallbackencoding)
133 133 r = u.encode(encoding, "replace")
134 134 if u == r.decode(encoding):
135 135 # r is a safe, non-lossy encoding of s
136 136 return r
137 137 return localstr(u.encode('UTF-8'), r)
138 138 except UnicodeDecodeError:
139 139 u = s.decode("utf-8", "replace") # last ditch
140 140 return u.encode(encoding, "replace") # can't round-trip
141 141 except LookupError, k:
142 142 raise error.Abort(k, hint="please check your locale settings")
143 143
144 144 def fromlocal(s):
145 145 """
146 146 Convert a string from the local character encoding to UTF-8
147 147
148 148 We attempt to decode strings using the encoding mode set by
149 149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
150 150 characters will cause an error message. Other modes include
151 151 'replace', which replaces unknown characters with a special
152 152 Unicode character, and 'ignore', which drops the character.
153 153 """
154 154
155 155 # can we do a lossless round-trip?
156 156 if isinstance(s, localstr):
157 157 return s._utf8
158 158
159 159 try:
160 160 return s.decode(encoding, encodingmode).encode("utf-8")
161 161 except UnicodeDecodeError, inst:
162 162 sub = s[max(0, inst.start - 10):inst.start + 10]
163 163 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
164 164 except LookupError, k:
165 165 raise error.Abort(k, hint="please check your locale settings")
166 166
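
A minimal sketch of what the three HGENCODINGMODE values mean, shown with the codec machinery directly rather than fromlocal() itself; the byte string is illustrative:

s = 'caf\xe9'                    # latin1 bytes; not valid UTF-8
try:
    s.decode('utf-8', 'strict')  # 'strict' raises on the bad byte
except UnicodeDecodeError:
    pass
assert s.decode('utf-8', 'replace') == u'caf\ufffd'  # substitute U+FFFD
assert s.decode('utf-8', 'ignore') == u'caf'         # drop the bad byte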
167 167 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
168 168 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
169 169 and "WFA" or "WF")
170 170
171 171 def colwidth(s):
172 172 "Find the column width of a string for display in the local encoding"
173 173 return ucolwidth(s.decode(encoding, 'replace'))
174 174
175 175 def ucolwidth(d):
176 176 "Find the column width of a Unicode string for display"
177 177 eaw = getattr(unicodedata, 'east_asian_width', None)
178 178 if eaw is not None:
179 179 return sum([eaw(c) in wide and 2 or 1 for c in d])
180 180 return len(d)
181 181
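
A quick check of the width classification ucolwidth() relies on, assuming a Python 2 interpreter with unicodedata available:

import unicodedata
# HIRAGANA LETTER A is East Asian Wide ('W') and counts as two columns;
# plain ASCII is Narrow ('Na') and counts as one.
assert unicodedata.east_asian_width(u'\u3042') == 'W'
assert unicodedata.east_asian_width(u'a') == 'Na'
# so u'\u3042a' occupies 2 + 1 = 3 columns under the default "WF" setting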
182 182 def getcols(s, start, c):
183 183 '''Use colwidth to find a c-column substring of s starting at byte
184 184 index start'''
185 185 for x in xrange(start + c, len(s)):
186 186 t = s[start:x]
187 187 if colwidth(t) == c:
188 188 return t
189 189
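
For ASCII input getcols() reduces to a plain slice, since one byte is one column; for multi-byte encodings it widens the byte range until the decoded text spans exactly c columns. A hedged spot check, assuming the active encoding accepts ASCII:

assert getcols('abcdef', 0, 3) == 'abc'  # 3 columns == 3 bytes for ASCII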
190 190 def trim(s, width, ellipsis='', leftside=False):
191 191 """Trim string 's' to at most 'width' columns (including 'ellipsis').
192 192
193 193 If 'leftside' is True, left side of string 's' is trimmed.
194 194 'ellipsis' is always placed at trimmed side.
195 195
196 196 >>> ellipsis = '+++'
197 197 >>> from mercurial import encoding
198 198 >>> encoding.encoding = 'utf-8'
199 199 >>> t = '1234567890'
200 200 >>> print trim(t, 12, ellipsis=ellipsis)
201 201 1234567890
202 202 >>> print trim(t, 10, ellipsis=ellipsis)
203 203 1234567890
204 204 >>> print trim(t, 8, ellipsis=ellipsis)
205 205 12345+++
206 206 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
207 207 +++67890
208 208 >>> print trim(t, 8)
209 209 12345678
210 210 >>> print trim(t, 8, leftside=True)
211 211 34567890
212 212 >>> print trim(t, 3, ellipsis=ellipsis)
213 213 +++
214 214 >>> print trim(t, 1, ellipsis=ellipsis)
215 215 +
216 216 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
217 217 >>> t = u.encode(encoding.encoding)
218 218 >>> print trim(t, 12, ellipsis=ellipsis)
219 219 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
220 220 >>> print trim(t, 10, ellipsis=ellipsis)
221 221 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
222 222 >>> print trim(t, 8, ellipsis=ellipsis)
223 223 \xe3\x81\x82\xe3\x81\x84+++
224 224 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
225 225 +++\xe3\x81\x88\xe3\x81\x8a
226 226 >>> print trim(t, 5)
227 227 \xe3\x81\x82\xe3\x81\x84
228 228 >>> print trim(t, 5, leftside=True)
229 229 \xe3\x81\x88\xe3\x81\x8a
230 230 >>> print trim(t, 4, ellipsis=ellipsis)
231 231 +++
232 232 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
233 233 +++
234 234 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
235 235 >>> print trim(t, 12, ellipsis=ellipsis)
236 236 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
237 237 >>> print trim(t, 10, ellipsis=ellipsis)
238 238 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
239 239 >>> print trim(t, 8, ellipsis=ellipsis)
240 240 \x11\x22\x33\x44\x55+++
241 241 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
242 242 +++\x66\x77\x88\x99\xaa
243 243 >>> print trim(t, 8)
244 244 \x11\x22\x33\x44\x55\x66\x77\x88
245 245 >>> print trim(t, 8, leftside=True)
246 246 \x33\x44\x55\x66\x77\x88\x99\xaa
247 247 >>> print trim(t, 3, ellipsis=ellipsis)
248 248 +++
249 249 >>> print trim(t, 1, ellipsis=ellipsis)
250 250 +
251 251 """
252 252 try:
253 253 u = s.decode(encoding)
254 254 except UnicodeDecodeError:
255 255 if len(s) <= width: # trimming is not needed
256 256 return s
257 257 width -= len(ellipsis)
258 258 if width <= 0: # not enough room even for ellipsis
259 259 return ellipsis[:width + len(ellipsis)]
260 260 if leftside:
261 261 return ellipsis + s[-width:]
262 262 return s[:width] + ellipsis
263 263
264 264 if ucolwidth(u) <= width: # trimming is not needed
265 265 return s
266 266
267 267 width -= len(ellipsis)
268 268 if width <= 0: # not enough room even for ellipsis
269 269 return ellipsis[:width + len(ellipsis)]
270 270
271 271 if leftside:
272 272 uslice = lambda i: u[i:]
273 273 concat = lambda s: ellipsis + s
274 274 else:
275 275 uslice = lambda i: u[:-i]
276 276 concat = lambda s: s + ellipsis
277 277 for i in xrange(1, len(u)):
278 278 usub = uslice(i)
279 279 if ucolwidth(usub) <= width:
280 280 return concat(usub.encode(encoding))
281 281 return ellipsis # not enough room for multi-column characters
282 282
283 283 def _asciilower(s):
284 284 '''convert a string to lowercase if ASCII
285 285
286 286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
287 287 s.decode('ascii')
288 288 return s.lower()
289 289
290 290 def asciilower(s):
291 291 # delay importing avoids cyclic dependency around "parsers" in
292 292 # pure Python build (util => i18n => encoding => parsers => util)
293 293 import parsers
294 294 impl = getattr(parsers, 'asciilower', _asciilower)
295 295 global asciilower
296 296 asciilower = impl
297 297 return impl(s)
298 298
299 299 def _asciiupper(s):
300 300 '''convert a string to uppercase if ASCII
301 301
302 302 Raises UnicodeDecodeError if non-ASCII characters are found.'''
303 303 s.decode('ascii')
304 304 return s.upper()
305 305
306 306 def asciiupper(s):
307 307 # delay importing avoids cyclic dependency around "parsers" in
308 308 # pure Python build (util => i18n => encoding => parsers => util)
309 309 import parsers
310 310 impl = getattr(parsers, 'asciiupper', _asciiupper)
311 311 global asciiupper
312 312 asciiupper = impl
313 313 return impl(s)
314 314
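
Both wrappers above use a self-replacing dispatch: the first call picks the best implementation (the C parsers version if present, the pure-Python fallback otherwise) and rebinds the module-level name so later calls skip the lookup. A standalone sketch of the same pattern, with illustrative names:

def _fallback(x):
    return x * 2

def compute(x):
    impl = _fallback     # encoding.py uses getattr(parsers, ..., _fallback)
    global compute
    compute = impl       # rebind the name; future calls go straight to impl
    return impl(x)

assert compute(2) == 4        # first call dispatches and rebinds
assert compute is _fallback   # wrapper has replaced itself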
315 315 def lower(s):
316 316 "best-effort encoding-aware case-folding of local string s"
317 317 try:
318 318 return asciilower(s)
319 319 except UnicodeDecodeError:
320 320 pass
321 321 try:
322 322 if isinstance(s, localstr):
323 323 u = s._utf8.decode("utf-8")
324 324 else:
325 325 u = s.decode(encoding, encodingmode)
326 326
327 327 lu = u.lower()
328 328 if u == lu:
329 329 return s # preserve localstring
330 330 return lu.encode(encoding)
331 331 except UnicodeError:
332 332 return s.lower() # we don't know how to fold this except in ASCII
333 333 except LookupError, k:
334 334 raise error.Abort(k, hint="please check your locale settings")
335 335
336 336 def upper(s):
337 337 "best-effort encoding-aware case-folding of local string s"
338 338 try:
339 339 return asciiupper(s)
340 340 except UnicodeDecodeError:
341 341 pass
342 342 try:
343 343 if isinstance(s, localstr):
344 344 u = s._utf8.decode("utf-8")
345 345 else:
346 346 u = s.decode(encoding, encodingmode)
347 347
348 348 uu = u.upper()
349 349 if u == uu:
350 350 return s # preserve localstring
351 351 return uu.encode(encoding)
352 352 except UnicodeError:
353 353 return s.upper() # we don't know how to fold this except in ASCII
354 354 except LookupError, k:
355 355 raise error.Abort(k, hint="please check your locale settings")
356 356
357 class normcasespecs(object):
358 '''what a platform's normcase does to ASCII strings
359
360 This is specified per platform, and should be consistent with what normcase
361 on that platform actually does.
362
363 lower: normcase lowercases ASCII strings
364 upper: normcase uppercases ASCII strings
365 other: the fallback function should always be called'''
366 lower = -1
367 upper = 1
368 other = 0
369
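
A hedged sketch of how a platform module might consume the new enum; the real consumers (e.g. posix.py or windows.py) are outside this diff, so the names below are illustrative only:

# Hypothetical platform module: declare what its normcase does to ASCII
# strings, and pair that with a fallback for everything else.
normcasespec = normcasespecs.lower   # this platform lowercases ASCII

def normcasefallback(path):
    # reached when the ASCII fast path does not apply (or always, for
    # platforms that declare normcasespecs.other)
    return path.lower()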
357 370 _jsonmap = {}
358 371
359 372 def jsonescape(s):
360 373 '''returns a string suitable for JSON
361 374
362 375 JSON is problematic for us because it doesn't support non-Unicode
363 376 bytes. To deal with this, we take the following approach:
364 377
365 378 - localstr objects are converted back to UTF-8
366 379 - valid UTF-8/ASCII strings are passed as-is
367 380 - other strings are converted to UTF-8b surrogate encoding
368 381 - apply JSON-specified string escaping
369 382
370 383 (escapes are doubled in these tests)
371 384
372 385 >>> jsonescape('this is a test')
373 386 'this is a test'
374 387 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
375 388 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
376 389 >>> jsonescape('a weird byte: \\xdd')
377 390 'a weird byte: \\xed\\xb3\\x9d'
378 391 >>> jsonescape('utf-8: caf\\xc3\\xa9')
379 392 'utf-8: caf\\xc3\\xa9'
380 393 >>> jsonescape('')
381 394 ''
382 395 '''
383 396
384 397 if not _jsonmap:
385 398 for x in xrange(32):
386 399 _jsonmap[chr(x)] = "\u%04x" % x
387 400 for x in xrange(32, 256):
388 401 c = chr(x)
389 402 _jsonmap[c] = c
390 403 _jsonmap['\t'] = '\\t'
391 404 _jsonmap['\n'] = '\\n'
392 405 _jsonmap['\"'] = '\\"'
393 406 _jsonmap['\\'] = '\\\\'
394 407 _jsonmap['\b'] = '\\b'
395 408 _jsonmap['\f'] = '\\f'
396 409 _jsonmap['\r'] = '\\r'
397 410
398 411 return ''.join(_jsonmap[c] for c in toutf8b(s))
399 412
400 413 def toutf8b(s):
401 414 '''convert a local, possibly-binary string into UTF-8b
402 415
403 416 This is intended as a generic method to preserve data when working
404 417 with schemes like JSON and XML that have no provision for
405 418 arbitrary byte strings. As Mercurial often doesn't know
406 419 what encoding data is in, we use so-called UTF-8b.
407 420
408 421 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
409 422 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
410 423 uDC00-uDCFF.
411 424
412 425 Principles of operation:
413 426
414 427 - ASCII and UTF-8 data successfully round-trips and is understood
415 428 by Unicode-oriented clients
416 429 - filenames and file contents in arbitrary other encodings can
417 430 be round-tripped or recovered by clueful clients
418 431 - local strings that have a cached known UTF-8 encoding (aka
419 432 localstr) get sent as UTF-8 so Unicode-oriented clients get the
420 433 Unicode data they want
421 434 - because we must preserve UTF-8 bytestring in places such as
422 435 filenames, metadata can't be roundtripped without help
423 436
424 437 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
425 438 arbitrary bytes into an internal Unicode format that can be
426 439 re-encoded back into the original. Here we are exposing the
427 440 internal surrogate encoding as a UTF-8 string.)
428 441 '''
429 442
430 443 if isinstance(s, localstr):
431 444 return s._utf8
432 445
433 446 try:
434 447 s.decode('utf-8')
435 448 return s
436 449 except UnicodeDecodeError:
437 450 # surrogate-encode any characters that don't round-trip
438 451 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
439 452 r = ""
440 453 pos = 0
441 454 for c in s:
442 455 if s2[pos:pos + 1] == c:
443 456 r += c
444 457 pos += 1
445 458 else:
446 459 r += unichr(0xdc00 + ord(c)).encode('utf-8')
447 460 return r
448 461
449 462 def fromutf8b(s):
450 463 '''Given a UTF-8b string, return a local, possibly-binary string.
451 464
452 465 Return the original binary string. This
453 466 is a round-trip process for strings like filenames, but metadata
454 467 that was passed through tolocal will remain in UTF-8.
455 468
456 469 >>> m = "\\xc3\\xa9\\x99abcd"
457 470 >>> n = toutf8b(m)
458 471 >>> n
459 472 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
460 473 >>> fromutf8b(n) == m
461 474 True
462 475 '''
463 476
464 477 # fast path - look for uDxxx prefixes in s
465 478 if "\xed" not in s:
466 479 return s
467 480
468 481 u = s.decode("utf-8")
469 482 r = ""
470 483 for c in u:
471 484 if ord(c) & 0xff00 == 0xdc00:
472 485 r += chr(ord(c) & 0xff)
473 486 else:
474 487 r += c.encode("utf-8")
475 488 return r
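
A hedged round-trip of the pair above under Python 2; the byte \x99 is arbitrary, and any byte sequence that is invalid UTF-8 behaves the same way:

raw = 'caf\x99'                       # not valid UTF-8
enc = toutf8b(raw)                    # \x99 -> U+DC99, encoded as UTF-8
assert enc == 'caf' + unichr(0xdc99).encode('utf-8')  # 'caf\xed\xb2\x99'
assert fromutf8b(enc) == raw          # exact original bytes recovered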