encoding: add 'trim' to trim multi-byte characters at most specified columns...
FUJIWARA Katsunori
r21856:d24969ee default
@@ -1,287 +1,357 @@
1 1 # encoding.py - character transcoding support for Mercurial
2 2 #
3 3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 import error
9 9 import unicodedata, locale, os
10 10
11 11 def _getpreferredencoding():
12 12 '''
13 13 On darwin, getpreferredencoding ignores the locale environment and
14 14 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
15 15 for Python 2.7 and up. This is the same corrected code for earlier
16 16 Python versions.
17 17
18 18 However, we can't use a version check for this method, as some distributions
19 19 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
20 20 encoding, as it is unlikely that this encoding is actually the expected one.
21 21 '''
22 22 try:
23 23 locale.CODESET
24 24 except AttributeError:
25 25 # Fall back to parsing environment variables :-(
26 26 return locale.getdefaultlocale()[1]
27 27
28 28 oldloc = locale.setlocale(locale.LC_CTYPE)
29 29 locale.setlocale(locale.LC_CTYPE, "")
30 30 result = locale.nl_langinfo(locale.CODESET)
31 31 locale.setlocale(locale.LC_CTYPE, oldloc)
32 32
33 33 return result
34 34
35 35 _encodingfixers = {
36 36 '646': lambda: 'ascii',
37 37 'ANSI_X3.4-1968': lambda: 'ascii',
38 38 'mac-roman': _getpreferredencoding
39 39 }
40 40
41 41 try:
42 42 encoding = os.environ.get("HGENCODING")
43 43 if not encoding:
44 44 encoding = locale.getpreferredencoding() or 'ascii'
45 45 encoding = _encodingfixers.get(encoding, lambda: encoding)()
46 46 except locale.Error:
47 47 encoding = 'ascii'
48 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 49 fallbackencoding = 'ISO-8859-1'
50 50
51 51 class localstr(str):
52 52 '''This class allows strings that are unmodified to be
53 53 round-tripped to the local encoding and back'''
54 54 def __new__(cls, u, l):
55 55 s = str.__new__(cls, l)
56 56 s._utf8 = u
57 57 return s
58 58 def __hash__(self):
59 59 return hash(self._utf8) # avoid collisions in local string space
60 60
61 61 def tolocal(s):
62 62 """
63 63 Convert a string from internal UTF-8 to local encoding
64 64
65 65 All internal strings should be UTF-8 but some repos before the
66 66 implementation of locale support may contain latin1 or possibly
67 67 other character sets. We attempt to decode everything strictly
68 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 69 replace unknown characters.
70 70
71 71 The localstr class is used to cache the known UTF-8 encoding of
72 72 strings next to their local representation to allow lossless
73 73 round-trip conversion back to UTF-8.
74 74
75 75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 76 >>> l = tolocal(u)
77 77 >>> l
78 78 'foo: ?'
79 79 >>> fromlocal(l)
80 80 'foo: \\xc3\\xa4'
81 81 >>> u2 = 'foo: \\xc3\\xa1'
82 82 >>> d = { l: 1, tolocal(u2): 2 }
83 83 >>> len(d) # no collision
84 84 2
85 85 >>> 'foo: ?' in d
86 86 False
87 87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 88 >>> l = tolocal(l1)
89 89 >>> l
90 90 'foo: ?'
91 91 >>> fromlocal(l) # magically in utf-8
92 92 'foo: \\xc3\\xa4'
93 93 """
94 94
95 95 try:
96 96 try:
97 97 # make sure string is actually stored in UTF-8
98 98 u = s.decode('UTF-8')
99 99 if encoding == 'UTF-8':
100 100 # fast path
101 101 return s
102 102 r = u.encode(encoding, "replace")
103 103 if u == r.decode(encoding):
104 104 # r is a safe, non-lossy encoding of s
105 105 return r
106 106 return localstr(s, r)
107 107 except UnicodeDecodeError:
108 108 # we should only get here if we're looking at an ancient changeset
109 109 try:
110 110 u = s.decode(fallbackencoding)
111 111 r = u.encode(encoding, "replace")
112 112 if u == r.decode(encoding):
113 113 # r is a safe, non-lossy encoding of s
114 114 return r
115 115 return localstr(u.encode('UTF-8'), r)
116 116 except UnicodeDecodeError:
117 117 u = s.decode("utf-8", "replace") # last ditch
118 118 return u.encode(encoding, "replace") # can't round-trip
119 119 except LookupError, k:
120 120 raise error.Abort(k, hint="please check your locale settings")
121 121
122 122 def fromlocal(s):
123 123 """
124 124 Convert a string from the local character encoding to UTF-8
125 125
126 126 We attempt to decode strings using the encoding mode set by
127 127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 128 characters will cause an error message. Other modes include
129 129 'replace', which replaces unknown characters with a special
130 130 Unicode character, and 'ignore', which drops the character.
131 131 """
132 132
133 133 # can we do a lossless round-trip?
134 134 if isinstance(s, localstr):
135 135 return s._utf8
136 136
137 137 try:
138 138 return s.decode(encoding, encodingmode).encode("utf-8")
139 139 except UnicodeDecodeError, inst:
140 140 sub = s[max(0, inst.start - 10):inst.start + 10]
141 141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 142 except LookupError, k:
143 143 raise error.Abort(k, hint="please check your locale settings")
144 144
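The three modes named in the docstring map directly onto Python's codec error handlers. A minimal standalone sketch of the difference, paraphrasing fromlocal with the local encoding pretended to be ASCII (not part of this changeset):

# how the HGENCODINGMODE error handler changes the result for a byte
# that is not valid in the local encoding (pretended to be ASCII here)
bad = 'caf\xe9'
print repr(bad.decode('ascii', 'replace').encode('utf-8'))  # 'caf\xef\xbf\xbd'
print repr(bad.decode('ascii', 'ignore').encode('utf-8'))   # 'caf'
# the default 'strict' handler raises UnicodeDecodeError instead,
# which fromlocal turns into error.Abort("decoding near ...")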
145 145 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
146 146 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
147 147 and "WFA" or "WF")
148 148
149 149 def colwidth(s):
150 150 "Find the column width of a string for display in the local encoding"
151 151 return ucolwidth(s.decode(encoding, 'replace'))
152 152
153 153 def ucolwidth(d):
154 154 "Find the column width of a Unicode string for display"
155 155 eaw = getattr(unicodedata, 'east_asian_width', None)
156 156 if eaw is not None:
157 157 return sum([eaw(c) in wide and 2 or 1 for c in d])
158 158 return len(d)
159 159
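Only code points whose East Asian width class is 'A' (ambiguous) are affected by HGENCODINGAMBIGUOUS; wide ('W') and fullwidth ('F') characters always count as two columns. A small standalone paraphrase of the ucolwidth computation above (the helper name is invented for illustration):

# U+00B1 (plus-minus sign) is class 'A', U+3042 (hiragana 'a') is 'W',
# ASCII 'a' is 'Na'; only the 'A' character changes width between modes
import unicodedata

def _width(u, ambiguouswide=False):
    wide = ambiguouswide and "WFA" or "WF"
    eaw = unicodedata.east_asian_width
    return sum([eaw(c) in wide and 2 or 1 for c in u])

print _width(u'a\u3042\u00b1')                      # 1 + 2 + 1 = 4
print _width(u'a\u3042\u00b1', ambiguouswide=True)  # 1 + 2 + 2 = 5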
160 160 def getcols(s, start, c):
161 161 '''Use colwidth to find a c-column substring of s starting at byte
162 162 index start'''
163 163 for x in xrange(start + c, len(s)):
164 164 t = s[start:x]
165 165 if colwidth(t) == c:
166 166 return t
167 167
168 def trim(s, width, ellipsis=''):
169 """Trim string 's' to at most 'width' columns (including 'ellipsis').
170
171 >>> ellipsis = '+++'
172 >>> from mercurial import encoding
173 >>> encoding.encoding = 'utf-8'
174 >>> t = '1234567890'
175 >>> print trim(t, 12, ellipsis=ellipsis)
176 1234567890
177 >>> print trim(t, 10, ellipsis=ellipsis)
178 1234567890
179 >>> print trim(t, 8, ellipsis=ellipsis)
180 12345+++
181 >>> print trim(t, 8)
182 12345678
183 >>> print trim(t, 3, ellipsis=ellipsis)
184 +++
185 >>> print trim(t, 1, ellipsis=ellipsis)
186 +
187 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
188 >>> t = u.encode(encoding.encoding)
189 >>> print trim(t, 12, ellipsis=ellipsis)
190 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
191 >>> print trim(t, 10, ellipsis=ellipsis)
192 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
193 >>> print trim(t, 8, ellipsis=ellipsis)
194 \xe3\x81\x82\xe3\x81\x84+++
195 >>> print trim(t, 5)
196 \xe3\x81\x82\xe3\x81\x84
197 >>> print trim(t, 4, ellipsis=ellipsis)
198 +++
199 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
200 >>> print trim(t, 12, ellipsis=ellipsis)
201 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
202 >>> print trim(t, 10, ellipsis=ellipsis)
203 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
204 >>> print trim(t, 8, ellipsis=ellipsis)
205 \x11\x22\x33\x44\x55+++
206 >>> print trim(t, 8)
207 \x11\x22\x33\x44\x55\x66\x77\x88
208 >>> print trim(t, 3, ellipsis=ellipsis)
209 +++
210 >>> print trim(t, 1, ellipsis=ellipsis)
211 +
212 """
213 try:
214 u = s.decode(encoding)
215 except UnicodeDecodeError:
216 if len(s) <= width: # trimming is not needed
217 return s
218 width -= len(ellipsis)
219 if width <= 0: # not enough room even for ellipsis
220 return ellipsis[:width + len(ellipsis)]
221 return s[:width] + ellipsis
222
223 if ucolwidth(u) <= width: # trimming is not needed
224 return s
225
226 width -= len(ellipsis)
227 if width <= 0: # not enough room even for ellipsis
228 return ellipsis[:width + len(ellipsis)]
229
230 uslice = lambda i: u[:-i]
231 concat = lambda s: s + ellipsis
232 for i in xrange(1, len(u)):
233 usub = uslice(i)
234 if ucolwidth(usub) <= width:
235 return concat(usub.encode(encoding))
236 return ellipsis # not enough room for multi-column characters
237
168 238 def lower(s):
169 239 "best-effort encoding-aware case-folding of local string s"
170 240 try:
171 241 s.decode('ascii') # throw exception for non-ASCII character
172 242 return s.lower()
173 243 except UnicodeDecodeError:
174 244 pass
175 245 try:
176 246 if isinstance(s, localstr):
177 247 u = s._utf8.decode("utf-8")
178 248 else:
179 249 u = s.decode(encoding, encodingmode)
180 250
181 251 lu = u.lower()
182 252 if u == lu:
183 253 return s # preserve localstring
184 254 return lu.encode(encoding)
185 255 except UnicodeError:
186 256 return s.lower() # we don't know how to fold this except in ASCII
187 257 except LookupError, k:
188 258 raise error.Abort(k, hint="please check your locale settings")
189 259
190 260 def upper(s):
191 261 "best-effort encoding-aware case-folding of local string s"
192 262 try:
193 263 s.decode('ascii') # throw exception for non-ASCII character
194 264 return s.upper()
195 265 except UnicodeDecodeError:
196 266 pass
197 267 try:
198 268 if isinstance(s, localstr):
199 269 u = s._utf8.decode("utf-8")
200 270 else:
201 271 u = s.decode(encoding, encodingmode)
202 272
203 273 uu = u.upper()
204 274 if u == uu:
205 275 return s # preserve localstring
206 276 return uu.encode(encoding)
207 277 except UnicodeError:
208 278 return s.upper() # we don't know how to fold this except in ASCII
209 279 except LookupError, k:
210 280 raise error.Abort(k, hint="please check your locale settings")
211 281
212 282 def toutf8b(s):
213 283 '''convert a local, possibly-binary string into UTF-8b
214 284
215 285 This is intended as a generic method to preserve data when working
216 286 with schemes like JSON and XML that have no provision for
217 287 arbitrary byte strings. As Mercurial often doesn't know
218 288 what encoding data is in, we use so-called UTF-8b.
219 289
220 290 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
221 291 Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
222 292 U+DC00-U+DCFF.
223 293
224 294 Principles of operation:
225 295
226 296 - ASCII and UTF-8 data successfully round-trips and is understood
227 297 by Unicode-oriented clients
228 298 - filenames and file contents in arbitrary other encodings can
229 299 be round-tripped or recovered by clueful clients
230 300 - local strings that have a cached known UTF-8 encoding (aka
231 301 localstr) get sent as UTF-8 so Unicode-oriented clients get the
232 302 Unicode data they want
233 303 - because we must preserve UTF-8 bytestrings in places such as
234 304 filenames, metadata can't be roundtripped without help
235 305
236 306 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
237 307 arbitrary bytes into an internal Unicode format that can be
238 308 re-encoded back into the original. Here we are exposing the
239 309 internal surrogate encoding as a UTF-8 string.)
240 310 '''
241 311
242 312 if isinstance(s, localstr):
243 313 return s._utf8
244 314
245 315 try:
246 316 if s.decode('utf-8'):
247 317 return s
248 318 except UnicodeDecodeError:
249 319 # surrogate-encode any characters that don't round-trip
250 320 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
251 321 r = ""
252 322 pos = 0
253 323 for c in s:
254 324 if s2[pos:pos + 1] == c:
255 325 r += c
256 326 pos += 1
257 327 else:
258 328 r += unichr(0xdc00 + ord(c)).encode('utf-8')
259 329 return r
260 330
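A short sketch of the mapping described above, assuming the module is importable as mercurial.encoding: valid UTF-8 passes through untouched, while a stray byte is moved into the U+DC00 page, from which fromutf8b below can recover it:

# illustrative only; mirrors the fromutf8b doctest further down
from mercurial import encoding

print repr(encoding.toutf8b('plain ascii'))        # unchanged: 'plain ascii'
print repr(encoding.toutf8b('caf\xe9'))            # '\xe9' is not valid UTF-8,
                                                   # so it becomes U+DCE9:
                                                   # 'caf\xed\xb3\xa9'
print repr(encoding.fromutf8b('caf\xed\xb3\xa9'))  # back to 'caf\xe9'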
261 331 def fromutf8b(s):
262 332 '''Given a UTF-8b string, return a local, possibly-binary string.
263 333
264 334 This restores the original binary string. It
265 335 is a round-trip process for strings like filenames, but metadata
266 336 that was passed through tolocal will remain in UTF-8.
267 337
268 338 >>> m = "\\xc3\\xa9\\x99abcd"
269 339 >>> n = toutf8b(m)
270 340 >>> n
271 341 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
272 342 >>> fromutf8b(n) == m
273 343 True
274 344 '''
275 345
276 346 # fast path - look for uDxxx prefixes in s
277 347 if "\xed" not in s:
278 348 return s
279 349
280 350 u = s.decode("utf-8")
281 351 r = ""
282 352 for c in u:
283 353 if ord(c) & 0xff00 == 0xdc00:
284 354 r += chr(ord(c) & 0xff)
285 355 else:
286 356 r += c.encode("utf-8")
287 357 return r
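The practical gain of trim over naive slicing is that s[:width] can split a multi-byte sequence or leave a double-width character straddling the limit, while trim counts display columns and only falls back to byte slicing for input that does not decode at all. A usage sketch, not part of the changeset; the helper name, the 70-column limit and the '...' ellipsis are arbitrary, and the input is assumed to be in the local encoding:

# shorten a (locally encoded) description for one-line display
from mercurial import encoding

def oneline(desc, maxwidth=70):
    # keep only the first line, then trim by display columns rather
    # than bytes, so multi-byte characters are never cut in half
    firstline = desc.splitlines()[0] if desc else ''
    return encoding.trim(firstline, maxwidth, ellipsis='...')

print oneline('encoding: add trim to cut strings by display columns')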