upstream/mercurial-mirror Commit - r16133:84c58da3

encoding: introduce utf8-b helpers

Matt Mackall -

r16133:84c58da3 default

parent child

mercurial/encoding.py

0 +77 0

@@ -190,3 +190,80 b' def upper(s):'
190	return s.upper() # we don't know how to fold this except in ASCII	190	return s.upper() # we don't know how to fold this except in ASCII
191	except LookupError, k:	191	except LookupError, k:
192	raise error.Abort(k, hint="please check your locale settings")	192	raise error.Abort(k, hint="please check your locale settings")
		193
		194	def toutf8b(s):
		195	'''convert a local, possibly-binary string into UTF-8b
		196
		197	This is intended as a generic method to preserve data when working
		198	with schemes like JSON and XML that have no provision for
		199	arbitrary byte strings. As Mercurial often doesn't know
		200	what encoding data is in, we use so-called UTF-8b.
		201
		202	If a string is already valid UTF-8 (or ASCII), it passes unmodified.
		203	Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
		204	uDC00-uDCFF.
		205
		206	Principles of operation:
		207
		208	- ASCII and UTF-8 data sucessfully round-trips and is understood
		209	by Unicode-oriented clients
		210	- filenames and file contents in arbitrary other encodings can have
		211	be round-tripped or recovered by clueful clients
		212	- local strings that have a cached known UTF-8 encoding (aka
		213	localstr) get sent as UTF-8 so Unicode-oriented clients get the
		214	Unicode data they want
		215	- because we must preserve UTF-8 bytestring in places such as
		216	filenames, metadata can't be roundtripped without help
		217
		218	(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
		219	arbitrary bytes into an internal Unicode format that can be
		220	re-encoded back into the original. Here we are exposing the
		221	internal surrogate encoding as a UTF-8 string.)
		222	'''
		223
		224	if isinstance(s, localstr):
		225	return s._utf8
		226
		227	try:
		228	if s.decode('utf-8'):
		229	return s
		230	except UnicodeDecodeError:
		231	# surrogate-encode any characters that don't round-trip
		232	s2 = s.decode('utf-8', 'ignore').encode('utf-8')
		233	r = ""
		234	pos = 0
		235	for c in s:
		236	if s2[pos:pos + 1] == c:
		237	r += c
		238	pos += 1
		239	else:
		240	r += unichr(0xdc00 + ord(c)).encode('utf-8')
		241	return r
		242
		243	def fromutf8b(s):
		244	'''Given a UTF-8b string, return a local, possibly-binary string.
		245
		246	return the original binary string. This
		247	is a round-trip process for strings like filenames, but metadata
		248	that's was passed through tolocal will remain in UTF-8.
		249
		250	>>> m = "\\xc3\\xa9\\x99abcd"
		251	>>> n = toutf8b(m)
		252	>>> n
		253	'\\xc3\\xa9\\xed\\xb2\\x99abcd'
		254	>>> fromutf8b(n) == m
		255	True
		256	'''
		257
		258	# fast path - look for uDxxx prefixes in s
		259	if "\xed" not in s:
		260	return s
		261
		262	u = s.decode("utf-8")
		263	r = ""
		264	for c in u:
		265	if ord(c) & 0xff00 == 0xdc00:
		266	r += chr(ord(c) & 0xff)
		267	else:
		268	r += c.encode("utf-8")
		269	return r

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages