##// END OF EJS Templates
encoding: introduce utf8-b helpers
Matt Mackall -
r16133:84c58da3 default
parent child Browse files
Show More
@@ -190,3 +190,80 b' def upper(s):'
190 return s.upper() # we don't know how to fold this except in ASCII
190 return s.upper() # we don't know how to fold this except in ASCII
191 except LookupError, k:
191 except LookupError, k:
192 raise error.Abort(k, hint="please check your locale settings")
192 raise error.Abort(k, hint="please check your locale settings")
193
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    The empty string is valid UTF-8 and is returned as-is:

    >>> toutf8b("")
    ''

    Only the bytes that are not part of a valid sequence are escaped,
    even when a stray byte has the same value as a nearby valid one:

    >>> toutf8b("\\xc3\\xc3\\xa9")
    '\\xed\\xb3\\x83\\xc3\\xa9'
    '''

    if isinstance(s, localstr):
        # localstr carries the UTF-8 form it was created from
        return s._utf8

    try:
        # Only validity matters here.  The decoded value must not be
        # tested for truth: "" decodes to u"" and still has to be
        # returned unchanged.
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Slow path: walk the string one UTF-8 sequence at a time, copying
    # the sequences the codec accepts and surrogate-encoding every byte
    # that is not part of one.  Deciding by position (rather than by
    # comparing byte values against a cleaned-up copy) keeps the escape
    # on the byte that is actually invalid.
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        lead = ord(s[pos])
        if lead < 0x80:
            # ASCII is always valid on its own
            out.append(s[pos])
            pos += 1
            continue

        # sequence length announced by the lead byte; 0 means the byte
        # cannot start a sequence (continuation byte or 0xf8-0xff)
        if 0xc0 <= lead < 0xe0:
            n = 2
        elif 0xe0 <= lead < 0xf0:
            n = 3
        elif 0xf0 <= lead < 0xf8:
            n = 4
        else:
            n = 0

        seq = s[pos:pos + n]
        valid = False
        if n:
            try:
                # the codec stays the authority on validity (overlong
                # forms, truncated sequences, out-of-range values...)
                seq.decode('utf-8')
                valid = True
            except UnicodeDecodeError:
                pass

        if valid:
            out.append(seq)
            pos += n
        else:
            # escape just this byte as U+DC00 + byte and retry from the
            # next one
            out.append(unichr(0xdc00 + lead).encode('utf-8'))
            pos += 1
    return "".join(out)
242
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b: every escaped character in the range
    U+DC00-U+DCFF is turned back into the single raw byte it stands
    for and everything else is copied unchanged. This is a round-trip
    process for strings like filenames, but metadata that was passed
    through tolocal will remain in UTF-8.

    Raises UnicodeDecodeError if s contains "\\xed" but is not valid
    UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True

    Characters outside the BMP are left alone:

    >>> fromutf8b("\\xf0\\x90\\x80\\x80\\xed\\xb0\\x80")
    '\\xf0\\x90\\x80\\x80\\x00'
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # Validate only; the result is discarded.  This keeps raising
    # UnicodeDecodeError on malformed input and guarantees below that
    # every "\xed" is a lead byte followed by two continuation bytes.
    s.decode("utf-8")

    # Work on the bytes rather than on the decoded unicode object: on
    # Python builds with 16-bit unicode, a character outside the BMP
    # decodes to a surrogate pair whose halves would be processed (and
    # possibly mistaken for escapes) one at a time.
    out = []
    done = 0  # everything before this index has already been emitted
    pos = s.find("\xed")
    while pos != -1:
        seq = s[pos:pos + 3]
        # U+DC00-U+DCFF encode as "\xed\xb0\x80" through "\xed\xb3\xbf"
        if "\xed\xb0\x80" <= seq <= "\xed\xb3\xbf":
            out.append(s[done:pos])
            # low 8 bits of the code point: 2 bits from the second
            # byte, 6 bits from the third
            out.append(chr(((ord(seq[1]) & 0x03) << 6)
                           | (ord(seq[2]) & 0x3f)))
            done = pos + 3
        # the two continuation bytes can never be "\xed"
        pos = s.find("\xed", pos + 3)
    out.append(s[done:])
    return "".join(out)
General Comments 0
You need to be logged in to leave comments. Login now