Show More
@@ -190,3 +190,80 b' def upper(s):' | |||||
190 | return s.upper() # we don't know how to fold this except in ASCII |
|
190 | return s.upper() # we don't know how to fold this except in ASCII | |
191 | except LookupError, k: |
|
191 | except LookupError, k: | |
192 | raise error.Abort(k, hint="please check your locale settings") |
|
192 | raise error.Abort(k, hint="please check your locale settings") | |
|
193 | ||||
|
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    s is a byte string or a localstr; the result is always a byte
    string. The empty string is returned unchanged.
    '''

    if isinstance(s, localstr):
        return s._utf8

    # fast path: anything that decodes cleanly (including the empty
    # string, whose decoded form is falsy) is returned untouched
    try:
        s.decode('utf-8')
    except UnicodeDecodeError:
        pass
    else:
        return s

    # NOTE(review): input that decodes cleanly but already contains
    # sequences encoding U+DC00-U+DCFF (if the decoder accepts them) is
    # returned as-is above, so fromutf8b() would turn those sequences
    # into single bytes -- confirm whether such input needs re-escaping.

    # number of bytes to try decoding as one character, indexed by the
    # high nibble of the lead byte; stray continuation bytes (0x8-0xb)
    # get length 1 so that the decode attempt below rejects them
    utf8len = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4]

    # Walk the string one candidate character at a time. Deciding
    # validity per position (rather than comparing against a copy with
    # the bad bytes stripped) guarantees that the byte being escaped is
    # the one that actually failed to decode.
    r = []
    pos = 0
    l = len(s)
    while pos < l:
        n = utf8len[ord(s[pos]) >> 4]
        c = s[pos:pos + n]
        try:
            c.decode('utf-8')
            pos += n
        except UnicodeDecodeError:
            # surrogate-encode the single byte that doesn't round-trip
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r.append(c)
    return "".join(r)
|
242 | ||||
|
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Undo the escaping done by toutf8b: every three-byte UTF-8 sequence
    that encodes a code point in the range uDC00-uDCFF is replaced by
    the single byte held in its low eight bits, and all other bytes are
    copied through unchanged. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    The work is done on the bytes themselves rather than through the
    unicode type: on builds that store unicode as UTF-16, a non-BMP
    character would be iterated as a surrogate pair and its halves
    could be mistaken for escaped bytes. As a consequence the input is
    not validated as UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    r = []
    pos = 0
    while True:
        i = s.find("\xed", pos)
        if i < 0:
            # no more candidate lead bytes: keep the remainder verbatim
            r.append(s[pos:])
            break
        seq = s[i:i + 3]
        # uDC00-uDCFF encode as ED B0 80 .. ED B3 BF
        if (len(seq) == 3 and "\xb0" <= seq[1] <= "\xb3"
            and "\x80" <= seq[2] <= "\xbf"):
            r.append(s[pos:i])
            # low 2 bits of the second byte and low 6 bits of the
            # third byte together form the original byte value
            r.append(chr(((ord(seq[1]) & 0x03) << 6)
                         | (ord(seq[2]) & 0x3f)))
            pos = i + 3
        else:
            # some other character starting with ED: leave it alone
            r.append(s[pos:i + 1])
            pos = i + 1
    return "".join(r)
General Comments 0
You need to be logged in to leave comments.
Login now