upstream/mercurial-mirror Commit - r16133:84c58da3

encoding: introduce utf8-b helpers

Matt Mackall -

r16133:84c58da3 default

parent child

mercurial/encoding.py

0 +77 0

		@@ -190,3 +190,80 b' def upper(s):'
190	190	return s.upper() # we don't know how to fold this except in ASCII
191	191	except LookupError, k:
192	192	raise error.Abort(k, hint="please check your locale settings")
	193
	194	def toutf8b(s):
	195	'''convert a local, possibly-binary string into UTF-8b
	196
	197	This is intended as a generic method to preserve data when working
	198	with schemes like JSON and XML that have no provision for
	199	arbitrary byte strings. As Mercurial often doesn't know
	200	what encoding data is in, we use so-called UTF-8b.
	201
	202	If a string is already valid UTF-8 (or ASCII), it passes unmodified.
	203	Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
	204	uDC00-uDCFF.
	205
	206	Principles of operation:
	207
	208	- ASCII and UTF-8 data sucessfully round-trips and is understood
	209	by Unicode-oriented clients
	210	- filenames and file contents in arbitrary other encodings can have
	211	be round-tripped or recovered by clueful clients
	212	- local strings that have a cached known UTF-8 encoding (aka
	213	localstr) get sent as UTF-8 so Unicode-oriented clients get the
	214	Unicode data they want
	215	- because we must preserve UTF-8 bytestring in places such as
	216	filenames, metadata can't be roundtripped without help
	217
	218	(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
	219	arbitrary bytes into an internal Unicode format that can be
	220	re-encoded back into the original. Here we are exposing the
	221	internal surrogate encoding as a UTF-8 string.)
	222	'''
	223
	224	if isinstance(s, localstr):
	225	return s._utf8
	226
	227	try:
	228	if s.decode('utf-8'):
	229	return s
	230	except UnicodeDecodeError:
	231	# surrogate-encode any characters that don't round-trip
	232	s2 = s.decode('utf-8', 'ignore').encode('utf-8')
	233	r = ""
	234	pos = 0
	235	for c in s:
	236	if s2[pos:pos + 1] == c:
	237	r += c
	238	pos += 1
	239	else:
	240	r += unichr(0xdc00 + ord(c)).encode('utf-8')
	241	return r
	242
	243	def fromutf8b(s):
	244	'''Given a UTF-8b string, return a local, possibly-binary string.
	245
	246	return the original binary string. This
	247	is a round-trip process for strings like filenames, but metadata
	248	that's was passed through tolocal will remain in UTF-8.
	249
	250	>>> m = "\\xc3\\xa9\\x99abcd"
	251	>>> n = toutf8b(m)
	252	>>> n
	253	'\\xc3\\xa9\\xed\\xb2\\x99abcd'
	254	>>> fromutf8b(n) == m
	255	True
	256	'''
	257
	258	# fast path - look for uDxxx prefixes in s
	259	if "\xed" not in s:
	260	return s
	261
	262	u = s.decode("utf-8")
	263	r = ""
	264	for c in u:
	265	if ord(c) & 0xff00 == 0xdc00:
	266	r += chr(ord(c) & 0xff)
	267	else:
	268	r += c.encode("utf-8")
	269	return r

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages