Show More
@@ -190,3 +190,80 b' def upper(s):' | |||
|
190 | 190 | return s.upper() # we don't know how to fold this except in ASCII |
|
191 | 191 | except LookupError, k: |
|
192 | 192 | raise error.Abort(k, hint="please check your locale settings") |
|
193 | ||
|
def _utf8bseq(s, pos):
    '''return the well-formed UTF-8 sequence starting at s[pos], or None

    None means the byte at pos does not start a sequence that the
    'utf-8' codec accepts (stray continuation byte, truncated or
    otherwise malformed sequence).
    '''
    lead = ord(s[pos])
    if lead < 0x80: # ASCII
        return s[pos]
    if lead < 0xc0: # continuation byte without a lead byte
        return None

    # the lead byte announces how many bytes the sequence should have
    if lead < 0xe0:
        n = 2
    elif lead < 0xf0:
        n = 3
    else:
        n = 4

    c = s[pos:pos + n]
    try:
        # a short slice (end of string) fails to decode as well
        c.decode('utf-8')
    except UnicodeDecodeError:
        return None
    return c

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    U+DC00-U+DCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)

    Each byte that is not part of a well-formed UTF-8 sequence is
    replaced by the UTF-8 encoding of U+DC00 + byte. The bytes of a
    U+DCxx character that is already present in the input are escaped
    the same way, so that fromutf8b() always gives back the original
    string. The empty string is returned as is.

    >>> toutf8b("")
    ''
    >>> toutf8b("\\xc3\\xa9\\x99abcd")
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> toutf8b("\\xc3\\xc3\\xa9")
    '\\xed\\xb3\\x83\\xc3\\xa9'
    '''

    if isinstance(s, localstr):
        return s._utf8

    # fast path: without a 0xED byte there can be no U+DCxx character
    # to re-escape, so a string that decodes cleanly (the empty string
    # included) is returned untouched
    if "\xed" not in s:
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    r = []
    pos = 0
    l = len(s)
    while pos < l:
        c = _utf8bseq(s, pos)
        if c is None or "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # surrogate-encode a single byte: either it doesn't belong
            # to a well-formed sequence, or it starts a U+DCxx character
            # that fromutf8b() would otherwise take for one of our
            # escapes (its trailing bytes get escaped on the next
            # iterations, being stray continuation bytes by then)
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            pos += len(c)
        r.append(c)
    return "".join(r)
|
242 | ||
|
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Every U+DCxx character (the escape used by UTF-8b for a byte that
    could not be kept as UTF-8) is turned back into the single byte xx;
    everything else is copied through untouched. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    The string is scanned as bytes instead of being decoded to unicode:
    on Python builds that use UTF-16 internally, a decoded non-BMP
    character is seen as a surrogate pair whose low half can fall in
    U+DC00-U+DCFF and would wrongly be taken for an escaped byte. As a
    consequence, malformed input is not rejected; bytes that aren't
    part of an escape are simply kept.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    >>> fromutf8b("\\xf0\\x90\\x80\\x80\\xed\\xb2\\x99")
    '\\xf0\\x90\\x80\\x80\\x99'
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # In well-formed UTF-8, 0xED only ever appears as the lead byte of
    # a three-byte sequence, and U+DC00-U+DCFF are exactly the sequences
    # ED B0..B3 80..BF, so jumping from one 0xED to the next finds every
    # escape without decoding anything.
    r = []
    pos = 0
    while True:
        idx = s.find("\xed", pos)
        if idx < 0:
            r.append(s[pos:])
            break
        # slices (not indexes) so that running off the end yields ""
        b1 = s[idx + 1:idx + 2]
        b2 = s[idx + 2:idx + 3]
        if "\xb0" <= b1 <= "\xb3" and "\x80" <= b2 <= "\xbf":
            r.append(s[pos:idx])
            # low 8 bits of the code point: 2 bits from the second
            # byte, 6 bits from the third
            r.append(chr(((ord(b1) & 0x03) << 6) | (ord(b2) & 0x3f)))
            pos = idx + 3
        else:
            # some other character starting with 0xED: keep it
            r.append(s[pos:idx + 1])
            pos = idx + 1
    return "".join(r)
General Comments 0
You need to be logged in to leave comments.
Login now