##// END OF EJS Templates
encoding: add hfsignoreclean to clean out HFS-ignored characters...
Augie Fackler -
r23596:885bd7c5 stable
parent child Browse files
Show More
@@ -1,438 +1,460 b''
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
11 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
12 # "Unicode Subtleties"), so we need to ignore them in some places for
13 # sanity.
14 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
15 "200c 200d 200e 200f 202a 202b 202c 202d 202e "
16 "206a 206b 206c 206d 206e 206f feff".split()]
17 # verify the next function will work
18 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
19
def hfsignoreclean(s):
    """Return s with every byte sequence listed in _ignore removed.

    s is a UTF-8 encoded byte string. Strings that contain neither of
    the two possible lead bytes are returned untouched.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: all ignored sequences start with \xe2 or \xef
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
32
11 33 def _getpreferredencoding():
12 34 '''
13 35 On darwin, getpreferredencoding ignores the locale environment and
14 36 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 37 for Python 2.7 and up. This is the same corrected code for earlier
16 38 Python versions.
17 39
18 40 However, we can't use a version check for this method, as some distributions
19 41 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 42 encoding, as it is unlikely that this encoding is the actually expected.
21 43 '''
22 44 try:
23 45 locale.CODESET
24 46 except AttributeError:
25 47 # Fall back to parsing environment variables :-(
26 48 return locale.getdefaultlocale()[1]
27 49
28 50 oldloc = locale.setlocale(locale.LC_CTYPE)
29 51 locale.setlocale(locale.LC_CTYPE, "")
30 52 result = locale.nl_langinfo(locale.CODESET)
31 53 locale.setlocale(locale.LC_CTYPE, oldloc)
32 54
33 55 return result
34 56
# maps an encoding name reported by the locale module to a callable
# returning the encoding that should be used instead
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

# HGENCODING, when set and non-empty, wins over whatever the locale says
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error handling mode passed to decode() when reading local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() for data that is not valid UTF-8
fallbackencoding = 'ISO-8859-1'
50 72
class localstr(str):
    '''A string in the local encoding that remembers its UTF-8 original.

    The str value is the local representation (l); the UTF-8 bytes it
    was derived from (u) are kept in the _utf8 attribute so that strings
    that are unmodified can be round-tripped to the local encoding and
    back.'''
    def __new__(cls, u, l):
        self = str.__new__(cls, l)
        self._utf8 = u
        return self

    def __hash__(self):
        # hash the UTF-8 form rather than the local one to avoid
        # collisions in local string space
        return hash(self._utf8)
60 82
61 83 def tolocal(s):
62 84 """
63 85 Convert a string from internal UTF-8 to local encoding
64 86
65 87 All internal strings should be UTF-8 but some repos before the
66 88 implementation of locale support may contain latin1 or possibly
67 89 other character sets. We attempt to decode everything strictly
68 90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 91 replace unknown characters.
70 92
71 93 The localstr class is used to cache the known UTF-8 encoding of
72 94 strings next to their local representation to allow lossless
73 95 round-trip conversion back to UTF-8.
74 96
75 97 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 98 >>> l = tolocal(u)
77 99 >>> l
78 100 'foo: ?'
79 101 >>> fromlocal(l)
80 102 'foo: \\xc3\\xa4'
81 103 >>> u2 = 'foo: \\xc3\\xa1'
82 104 >>> d = { l: 1, tolocal(u2): 2 }
83 105 >>> len(d) # no collision
84 106 2
85 107 >>> 'foo: ?' in d
86 108 False
87 109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 110 >>> l = tolocal(l1)
89 111 >>> l
90 112 'foo: ?'
91 113 >>> fromlocal(l) # magically in utf-8
92 114 'foo: \\xc3\\xa4'
93 115 """
94 116
95 117 try:
96 118 try:
97 119 # make sure string is actually stored in UTF-8
98 120 u = s.decode('UTF-8')
99 121 if encoding == 'UTF-8':
100 122 # fast path
101 123 return s
102 124 r = u.encode(encoding, "replace")
103 125 if u == r.decode(encoding):
104 126 # r is a safe, non-lossy encoding of s
105 127 return r
106 128 return localstr(s, r)
107 129 except UnicodeDecodeError:
108 130 # we should only get here if we're looking at an ancient changeset
109 131 try:
110 132 u = s.decode(fallbackencoding)
111 133 r = u.encode(encoding, "replace")
112 134 if u == r.decode(encoding):
113 135 # r is a safe, non-lossy encoding of s
114 136 return r
115 137 return localstr(u.encode('UTF-8'), r)
116 138 except UnicodeDecodeError:
117 139 u = s.decode("utf-8", "replace") # last ditch
118 140 return u.encode(encoding, "replace") # can't round-trip
119 141 except LookupError, k:
120 142 raise error.Abort(k, hint="please check your locale settings")
121 143
122 144 def fromlocal(s):
123 145 """
124 146 Convert a string from the local character encoding to UTF-8
125 147
126 148 We attempt to decode strings using the encoding mode set by
127 149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 150 characters will cause an error message. Other modes include
129 151 'replace', which replaces unknown characters with a special
130 152 Unicode character, and 'ignore', which drops the character.
131 153 """
132 154
133 155 # can we do a lossless round-trip?
134 156 if isinstance(s, localstr):
135 157 return s._utf8
136 158
137 159 try:
138 160 return s.decode(encoding, encodingmode).encode("utf-8")
139 161 except UnicodeDecodeError, inst:
140 162 sub = s[max(0, inst.start - 10):inst.start + 10]
141 163 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 164 except LookupError, k:
143 165 raise error.Abort(k, hint="please check your locale settings")
144 166
# East Asian Width classes that occupy two columns. Ambiguous-width
# characters ("A") are only counted as wide when HGENCODINGAMBIGUOUS is
# set to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148 170
def colwidth(s):
    """Find the column width of a string for display in the local encoding

    Bytes that cannot be decoded are replaced before measuring."""
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)
152 174
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose East Asian Width class is listed in 'wide' count as
    two columns, all others as one. Without
    unicodedata.east_asian_width, every character counts as one."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    columns = 0
    for c in d:
        if eaw(c) in wide:
            columns += 2
        else:
            columns += 1
    return columns
159 181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate substrings s[start:x] are tried with growing x until one
    is exactly c columns wide; that substring is returned. If no
    candidate matches, the function falls off the end and returns None.
    '''
    # the candidate end runs up to and including len(s) so that a
    # substring reaching the very end of s can be found as well
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
167 189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    If 's' cannot be decoded in the local encoding, it is trimmed by
    bytes instead of by display columns.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: every byte is treated as one column
        if len(s) <= width:
            return s # trimming is not needed
        room = width - len(ellipsis)
        if room <= 0:
            # not enough room even for the ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        return s # trimming is not needed

    room = width - len(ellipsis)
    if room <= 0:
        # not enough room even for the ellipsis
        return ellipsis[:width]

    # drop one more character per round from the trimmed side until
    # what is left fits into the remaining columns
    for cut in xrange(1, len(u)):
        if leftside:
            kept = u[cut:]
        else:
            kept = u[:-cut]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(encoding)
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # not enough room for multi-column characters
260 282
261 283 def _asciilower(s):
262 284 '''convert a string to lowercase if ASCII
263 285
264 286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
265 287 s.decode('ascii')
266 288 return s.lower()
267 289
def asciilower(s):
    '''lowercase the ASCII string s

    The first call picks the implementation - parsers.asciilower when
    the parsers module provides one, _asciilower otherwise - and rebinds
    the module-level name to it, so this stub runs only once.'''
    global asciilower
    # importing lazily avoids a cyclic dependency around "parsers" in
    # the pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    asciilower = getattr(parsers, 'asciilower', _asciilower)
    return asciilower(s)
276 298
277 299 def lower(s):
278 300 "best-effort encoding-aware case-folding of local string s"
279 301 try:
280 302 return asciilower(s)
281 303 except UnicodeDecodeError:
282 304 pass
283 305 try:
284 306 if isinstance(s, localstr):
285 307 u = s._utf8.decode("utf-8")
286 308 else:
287 309 u = s.decode(encoding, encodingmode)
288 310
289 311 lu = u.lower()
290 312 if u == lu:
291 313 return s # preserve localstring
292 314 return lu.encode(encoding)
293 315 except UnicodeError:
294 316 return s.lower() # we don't know how to fold this except in ASCII
295 317 except LookupError, k:
296 318 raise error.Abort(k, hint="please check your locale settings")
297 319
298 320 def upper(s):
299 321 "best-effort encoding-aware case-folding of local string s"
300 322 try:
301 323 s.decode('ascii') # throw exception for non-ASCII character
302 324 return s.upper()
303 325 except UnicodeDecodeError:
304 326 pass
305 327 try:
306 328 if isinstance(s, localstr):
307 329 u = s._utf8.decode("utf-8")
308 330 else:
309 331 u = s.decode(encoding, encodingmode)
310 332
311 333 uu = u.upper()
312 334 if u == uu:
313 335 return s # preserve localstring
314 336 return uu.encode(encoding)
315 337 except UnicodeError:
316 338 return s.upper() # we don't know how to fold this except in ASCII
317 339 except LookupError, k:
318 340 raise error.Abort(k, hint="please check your locale settings")
319 341
# byte -> JSON-escaped text, filled in by the first jsonescape() call
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        for x in xrange(256):
            c = chr(x)
            if x < 32:
                # control characters get the generic \uXXXX form
                _jsonmap[c] = "\\u%04x" % x
            else:
                _jsonmap[c] = c
        # characters with a dedicated short escape
        _jsonmap.update({
            '\t': '\\t',
            '\n': '\\n',
            '\"': '\\"',
            '\\': '\\\\',
            '\b': '\\b',
            '\f': '\\f',
            '\r': '\\r',
        })

    return ''.join([_jsonmap[c] for c in toutf8b(s)])
362 384
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, each byte that is not part of a valid UTF-8 sequence is
    mapped to the UTF-16 surrogate range, uDC00-uDCFF (U+DC00 plus the
    byte value), and emitted UTF-8 encoded.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Walk the string one UTF-8 character at a time. Each candidate
    # sequence is validated on its own, so an invalid byte can never be
    # mistaken for part of a neighbouring valid character.
    parts = []
    pos = 0
    end = len(s)
    while pos < end:
        lead = ord(s[pos])
        if lead < 0x80:
            # ASCII needs no validation
            parts.append(s[pos])
            pos += 1
            continue
        # sequence length announced by the lead byte; stray continuation
        # bytes (0x80-0xbf) are tried alone and fail validation below
        if lead >= 0xf0:
            width = 4
        elif lead >= 0xe0:
            width = 3
        elif lead >= 0xc0:
            width = 2
        else:
            width = 1
        char = s[pos:pos + width]
        try:
            char.decode('utf-8')
        except UnicodeDecodeError:
            # surrogate-encode just this byte and resume right after it
            char = unichr(0xdc00 + lead).encode('utf-8')
            width = 1
        parts.append(char)
        pos += width
    return ''.join(parts)
411 433
412 434 def fromutf8b(s):
413 435 '''Given a UTF-8b string, return a local, possibly-binary string.
414 436
415 437 return the original binary string. This
416 438 is a round-trip process for strings like filenames, but metadata
417 439 that's was passed through tolocal will remain in UTF-8.
418 440
419 441 >>> m = "\\xc3\\xa9\\x99abcd"
420 442 >>> n = toutf8b(m)
421 443 >>> n
422 444 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
423 445 >>> fromutf8b(n) == m
424 446 True
425 447 '''
426 448
427 449 # fast path - look for uDxxx prefixes in s
428 450 if "\xed" not in s:
429 451 return s
430 452
431 453 u = s.decode("utf-8")
432 454 r = ""
433 455 for c in u:
434 456 if ord(c) & 0xff00 == 0xdc00:
435 457 r += chr(ord(c) & 0xff)
436 458 else:
437 459 r += c.encode("utf-8")
438 460 return r
General Comments 0
You need to be logged in to leave comments. Login now