upstream/mercurial-mirror Commit - r16133:84c58da3

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():

11

def _getpreferredencoding():

12

'''

12

'''

13

On darwin, getpreferredencoding ignores the locale environment and

13

On darwin, getpreferredencoding ignores the locale environment and

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

14

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

15

for Python 2.7 and up. This is the same corrected code for earlier

15

for Python 2.7 and up. This is the same corrected code for earlier

16

Python versions.

16

Python versions.

17

18

However, we can't use a version check for this method, as some distributions

18

However, we can't use a version check for this method, as some distributions

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

19

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

20

encoding, as it is unlikely that this encoding is the actually expected.

20

encoding, as it is unlikely that this encoding is the actually expected.

21

'''

21

'''

22

try:

22

try:

23

locale.CODESET

23

locale.CODESET

24

except AttributeError:

24

except AttributeError:

25

# Fall back to parsing environment variables :-(

25

# Fall back to parsing environment variables :-(

26

return locale.getdefaultlocale()[1]

26

return locale.getdefaultlocale()[1]

27

28

oldloc = locale.setlocale(locale.LC_CTYPE)

28

oldloc = locale.setlocale(locale.LC_CTYPE)

29

locale.setlocale(locale.LC_CTYPE, "")

29

locale.setlocale(locale.LC_CTYPE, "")

30

result = locale.nl_langinfo(locale.CODESET)

30

result = locale.nl_langinfo(locale.CODESET)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

31

locale.setlocale(locale.LC_CTYPE, oldloc)

32

33

return result

33

return result

34

35

_encodingfixers = {

35

_encodingfixers = {

36

'646': lambda: 'ascii',

36

'646': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

37

'ANSI_X3.4-1968': lambda: 'ascii',

38

'mac-roman': _getpreferredencoding

38

'mac-roman': _getpreferredencoding

39

}

39

}

40

41

try:

41

try:

42

encoding = os.environ.get("HGENCODING")

42

encoding = os.environ.get("HGENCODING")

43

if not encoding:

43

if not encoding:

44

encoding = locale.getpreferredencoding() or 'ascii'

44

encoding = locale.getpreferredencoding() or 'ascii'

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

45

encoding = _encodingfixers.get(encoding, lambda: encoding)()

46

except locale.Error:

46

except locale.Error:

47

encoding = 'ascii'

47

encoding = 'ascii'

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

48

encodingmode = os.environ.get("HGENCODINGMODE", "strict")

49

fallbackencoding = 'ISO-8859-1'

49

fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):

51

class localstr(str):

52

'''This class allows strings that are unmodified to be

52

'''This class allows strings that are unmodified to be

53

round-tripped to the local encoding and back'''

53

round-tripped to the local encoding and back'''

54

def __new__(cls, u, l):

54

def __new__(cls, u, l):

55

s = str.__new__(cls, l)

55

s = str.__new__(cls, l)

56

s._utf8 = u

56

s._utf8 = u

57

return s

57

return s

58

def __hash__(self):

58

def __hash__(self):

59

return hash(self._utf8) # avoid collisions in local string space

59

return hash(self._utf8) # avoid collisions in local string space

60

61

def tolocal(s):

61

def tolocal(s):

62

"""

62

"""

63

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

64

65

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

66

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

67

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

69

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

76

>>> l = tolocal(u)

77

>>> l

77

>>> l

78

'foo: ?'

78

'foo: ?'

79

>>> fromlocal(l)

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> d # no collision

83

>>> d # no collision

84

{'foo: ?': 1, 'foo: ?': 2}

84

{'foo: ?': 1, 'foo: ?': 2}

85

>>> 'foo: ?' in d

85

>>> 'foo: ?' in d

86

False

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

88

>>> l = tolocal(l1)

89

>>> l

89

>>> l

90

'foo: ?'

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

92

'foo: \\xc3\\xa4'

93

"""

93

"""

94

95

for e in ('UTF-8', fallbackencoding):

95

for e in ('UTF-8', fallbackencoding):

96

try:

96

try:

97

u = s.decode(e) # attempt strict decoding

97

u = s.decode(e) # attempt strict decoding

98

r = u.encode(encoding, "replace")

98

r = u.encode(encoding, "replace")

99

if u == r.decode(encoding):

99

if u == r.decode(encoding):

100

# r is a safe, non-lossy encoding of s

100

# r is a safe, non-lossy encoding of s

101

return r

101

return r

102

elif e == 'UTF-8':

102

elif e == 'UTF-8':

103

return localstr(s, r)

103

return localstr(s, r)

104

else:

104

else:

105

return localstr(u.encode('UTF-8'), r)

105

return localstr(u.encode('UTF-8'), r)

106

107

except LookupError, k:

107

except LookupError, k:

108

raise error.Abort(k, hint="please check your locale settings")

108

raise error.Abort(k, hint="please check your locale settings")

109

except UnicodeDecodeError:

109

except UnicodeDecodeError:

110

pass

110

pass

111

u = s.decode("utf-8", "replace") # last ditch

111

u = s.decode("utf-8", "replace") # last ditch

112

return u.encode(encoding, "replace") # can't round-trip

112

return u.encode(encoding, "replace") # can't round-trip

113

114

def fromlocal(s):

114

def fromlocal(s):

115

"""

115

"""

116

Convert a string from the local character encoding to UTF-8

116

Convert a string from the local character encoding to UTF-8

117

118

We attempt to decode strings using the encoding mode set by

118

We attempt to decode strings using the encoding mode set by

119

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

119

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

120

characters will cause an error message. Other modes include

120

characters will cause an error message. Other modes include

121

'replace', which replaces unknown characters with a special

121

'replace', which replaces unknown characters with a special

122

Unicode character, and 'ignore', which drops the character.

122

Unicode character, and 'ignore', which drops the character.

123

"""

123

"""

124

125

# can we do a lossless round-trip?

125

# can we do a lossless round-trip?

126

if isinstance(s, localstr):

126

if isinstance(s, localstr):

127

return s._utf8

127

return s._utf8

128

129

try:

129

try:

130

return s.decode(encoding, encodingmode).encode("utf-8")

130

return s.decode(encoding, encodingmode).encode("utf-8")

131

except UnicodeDecodeError, inst:

131

except UnicodeDecodeError, inst:

132

sub = s[max(0, inst.start - 10):inst.start + 10]

132

sub = s[max(0, inst.start - 10):inst.start + 10]

133

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

133

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

134

except LookupError, k:

134

except LookupError, k:

135

raise error.Abort(k, hint="please check your locale settings")

135

raise error.Abort(k, hint="please check your locale settings")

136

137

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

137

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

138

wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"

138

wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"

139

and "WFA" or "WF")

139

and "WFA" or "WF")

140

141

def colwidth(s):

141

def colwidth(s):

142

"Find the column width of a string for display in the local encoding"

142

"Find the column width of a string for display in the local encoding"

143

return ucolwidth(s.decode(encoding, 'replace'))

143

return ucolwidth(s.decode(encoding, 'replace'))

144

145

def ucolwidth(d):

145

def ucolwidth(d):

146

"Find the column width of a Unicode string for display"

146

"Find the column width of a Unicode string for display"

147

eaw = getattr(unicodedata, 'east_asian_width', None)

147

eaw = getattr(unicodedata, 'east_asian_width', None)

148

if eaw is not None:

148

if eaw is not None:

149

return sum([eaw(c) in wide and 2 or 1 for c in d])

149

return sum([eaw(c) in wide and 2 or 1 for c in d])

150

return len(d)

150

return len(d)

151

152

def getcols(s, start, c):

152

def getcols(s, start, c):

153

'''Use colwidth to find a c-column substring of s starting at byte

153

'''Use colwidth to find a c-column substring of s starting at byte

154

index start'''

154

index start'''

155

for x in xrange(start + c, len(s)):

155

for x in xrange(start + c, len(s)):

156

t = s[start:x]

156

t = s[start:x]

157

if colwidth(t) == c:

157

if colwidth(t) == c:

158

return t

158

return t

159

160

def lower(s):

160

def lower(s):

161

"best-effort encoding-aware case-folding of local string s"

161

"best-effort encoding-aware case-folding of local string s"

162

try:

162

try:

163

if isinstance(s, localstr):

163

if isinstance(s, localstr):

164

u = s._utf8.decode("utf-8")

164

u = s._utf8.decode("utf-8")

165

else:

165

else:

166

u = s.decode(encoding, encodingmode)

166

u = s.decode(encoding, encodingmode)

167

168

lu = u.lower()

168

lu = u.lower()

169

if u == lu:

169

if u == lu:

170

return s # preserve localstring

170

return s # preserve localstring

171

return lu.encode(encoding)

171

return lu.encode(encoding)

172

except UnicodeError:

172

except UnicodeError:

173

return s.lower() # we don't know how to fold this except in ASCII

173

return s.lower() # we don't know how to fold this except in ASCII

174

except LookupError, k:

174

except LookupError, k:

175

raise error.Abort(k, hint="please check your locale settings")

175

raise error.Abort(k, hint="please check your locale settings")

176

177

def upper(s):

177

def upper(s):

178

"best-effort encoding-aware case-folding of local string s"

178

"best-effort encoding-aware case-folding of local string s"

179

try:

179

try:

180

if isinstance(s, localstr):

180

if isinstance(s, localstr):

181

u = s._utf8.decode("utf-8")

181

u = s._utf8.decode("utf-8")

182

else:

182

else:

183

u = s.decode(encoding, encodingmode)

183

u = s.decode(encoding, encodingmode)

184

185

uu = u.upper()

185

uu = u.upper()

186

if u == uu:

186

if u == uu:

187

return s # preserve localstring

187

return s # preserve localstring

188

return uu.encode(encoding)

188

return uu.encode(encoding)

189

except UnicodeError:

189

except UnicodeError:

190

return s.upper() # we don't know how to fold this except in ASCII

190

return s.upper() # we don't know how to fold this except in ASCII

191

except LookupError, k:

191

except LookupError, k:

192

raise error.Abort(k, hint="please check your locale settings")

192

raise error.Abort(k, hint="please check your locale settings")

193

194

def toutf8b(s):

195

'''convert a local, possibly-binary string into UTF-8b

196

197

This is intended as a generic method to preserve data when working

198

with schemes like JSON and XML that have no provision for

199

arbitrary byte strings. As Mercurial often doesn't know

200

what encoding data is in, we use so-called UTF-8b.

201

202

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

203

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

204

uDC00-uDCFF.

205

206

Principles of operation:

207

208

- ASCII and UTF-8 data sucessfully round-trips and is understood

209

by Unicode-oriented clients

210

- filenames and file contents in arbitrary other encodings can have

211

be round-tripped or recovered by clueful clients

212

- local strings that have a cached known UTF-8 encoding (aka

213

localstr) get sent as UTF-8 so Unicode-oriented clients get the

214

Unicode data they want

215

- because we must preserve UTF-8 bytestring in places such as

216

filenames, metadata can't be roundtripped without help

217

218

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

219

arbitrary bytes into an internal Unicode format that can be

220

re-encoded back into the original. Here we are exposing the

221

internal surrogate encoding as a UTF-8 string.)

222

'''

223

224

if isinstance(s, localstr):

225

return s._utf8

226

227

try:

228

if s.decode('utf-8'):

229

return s

230

except UnicodeDecodeError:

231

# surrogate-encode any characters that don't round-trip

232

s2 = s.decode('utf-8', 'ignore').encode('utf-8')

233

r = ""

234

pos = 0

235

for c in s:

236

if s2[pos:pos + 1] == c:

237

r += c

238

pos += 1

239

else:

240

r += unichr(0xdc00 + ord(c)).encode('utf-8')

241

return r

242

243

def fromutf8b(s):

244

'''Given a UTF-8b string, return a local, possibly-binary string.

245

246

return the original binary string. This

247

is a round-trip process for strings like filenames, but metadata

248

that's was passed through tolocal will remain in UTF-8.

249

250

>>> m = "\\xc3\\xa9\\x99abcd"

251

>>> n = toutf8b(m)

252

>>> n

253

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

254

>>> fromutf8b(n) == m

255

True

256

'''

257

258

# fast path - look for uDxxx prefixes in s

259

if "\xed" not in s:

260

return s

261

262

u = s.decode("utf-8")

263

r = ""

264

for c in u:

265

if ord(c) & 0xff00 == 0xdc00:

266

r += chr(ord(c) & 0xff)

267

else:

268

r += c.encode("utf-8")

269

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 Return the codeset name of the user's locale, working around a
                 darwin bug in locale.getpreferredencoding().

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some
                 distributions patch Python to fix this. Instead, we use it as a
                 'fixer' for the mac-roman encoding, as it is unlikely that this
                 encoding is the one actually expected.

                 Errors raised by locale.setlocale are not handled here and
                 propagate to the caller.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 # switch LC_CTYPE to the environment's locale just long enough to
                 # ask for its codeset
                 locale.setlocale(locale.LC_CTYPE, "")
                 try:
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # the locale is process-wide state: put it back even if the
                     # query above raised
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Codeset names reported by the locale that must be translated before
             # use. Each value is a callable returning the encoding name to use.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # A non-empty HGENCODING is taken as-is. Otherwise the locale is asked
             # and its answer is passed through _encodingfixers; any locale.Error
             # along the way means falling back to ascii.
             encoding = os.environ.get("HGENCODING")
             if not encoding:
                 try:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     if encoding in _encodingfixers:
                         encoding = _encodingfixers[encoding]()
                 except locale.Error:
                     encoding = 'ascii'
             # error-handling mode handed to decode() by fromlocal, lower and upper
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # codec tolocal tries second, after strict UTF-8 decoding has failed
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A str in the local encoding that also remembers the UTF-8
                 string it was made from, so that an unmodified string can be
                 round-tripped to the local encoding and back'''
                 def __new__(cls, u, l):
                     # the str value itself is the local form l; the UTF-8 form u
                     # rides along as an attribute
                     self = str.__new__(cls, l)
                     self._utf8 = u
                     return self
                 def __hash__(self):
                     # hash on the UTF-8 form rather than the local text, since
                     # distinct originals may share one local rendering
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding
                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.
                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.
                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> d # no collision
                 {'foo: ?': 1, 'foo: ?': 2}
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 for e in ('UTF-8', fallbackencoding):
                     try:
                         u = s.decode(e) # attempt strict decoding
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         elif e == 'UTF-8':
                             return localstr(s, r)
                         else:
                             return localstr(u.encode('UTF-8'), r)
                     except LookupError, k:
                         raise error.Abort(k, hint="please check your locale settings")
                     except UnicodeDecodeError:
                         pass
                 u = s.decode("utf-8", "replace") # last ditch
                 return u.encode(encoding, "replace") # can't round-trip
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             # East Asian Width classes that ucolwidth counts as two columns. The
             # ambiguous class "A" is included only when HGENCODINGAMBIGUOUS is set
             # to 'wide'.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 """Return the display width, in columns, of local-encoding string s

                 s is decoded with the local encoding, replacing undecodable bytes,
                 and the result is measured by ucolwidth."""
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 """Return the display width, in columns, of Unicode string d

                 Characters whose East Asian Width class is listed in the module
                 level 'wide' setting count as two columns, all others as one. If
                 unicodedata has no east_asian_width, the width is len(d)."""
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 columns = 0
                 for ch in d:
                     if eaw(ch) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start

                 Slices s[start:x] are tried with increasing x, beginning with the
                 slice that is c bytes long, and the first one whose colwidth is
                 exactly c is returned. Returns None if no slice, up to and
                 including the one ending at the end of s, is c columns wide.'''
                 # the bound is len(s) + 1 so that the slice reaching the very end of
                 # s is tried as well; stopping at len(s) would never find a
                 # substring that ends where s ends
                 for x in xrange(start + c, len(s) + 1):
                     t = s[start:x]
                     if colwidth(t) == c:
                         return t
                 return None
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     lu = u.lower()
                     if u == lu:
                         return s # preserve localstring
                     return lu.encode(encoding)
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     uu = u.upper()
                     if u == uu:
                         return s # preserve localstring
                     return uu.encode(encoding)
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
+            def toutf8b(s):
+                '''convert a local, possibly-binary string into UTF-8b
+                This is intended as a generic method to preserve data when working
+                with schemes like JSON and XML that have no provision for
+                arbitrary byte strings. As Mercurial often doesn't know
+                what encoding data is in, we use so-called UTF-8b.
+                If a string is already valid UTF-8 (or ASCII), it passes unmodified.
+                Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
+                uDC00-uDCFF.
+                Principles of operation:
+                - ASCII and UTF-8 data sucessfully round-trips and is understood
+                  by Unicode-oriented clients
+                - filenames and file contents in arbitrary other encodings can have
+                  be round-tripped or recovered by clueful clients
+                - local strings that have a cached known UTF-8 encoding (aka
+                  localstr) get sent as UTF-8 so Unicode-oriented clients get the
+                  Unicode data they want
+                - because we must preserve UTF-8 bytestring in places such as
+                  filenames, metadata can't be roundtripped without help
+                (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
+                arbitrary bytes into an internal Unicode format that can be
+                re-encoded back into the original. Here we are exposing the
+                internal surrogate encoding as a UTF-8 string.)
+                '''
+                if isinstance(s, localstr):
+                    return s._utf8
+                try:
+                    if s.decode('utf-8'):
+                        return s
+                except UnicodeDecodeError:
+                    # surrogate-encode any characters that don't round-trip
+                    s2 = s.decode('utf-8', 'ignore').encode('utf-8')
+                    r = ""
+                    pos = 0
+                    for c in s:
+                        if s2[pos:pos + 1] == c:
+                            r += c
+                            pos += 1
+                        else:
+                            r += unichr(0xdc00 + ord(c)).encode('utf-8')
+                    return r
+            def fromutf8b(s):
+                '''Given a UTF-8b string, return a local, possibly-binary string.
+                return the original binary string. This
+                is a round-trip process for strings like filenames, but metadata
+                that's was passed through tolocal will remain in UTF-8.
+                >>> m = "\\xc3\\xa9\\x99abcd"
+                >>> n = toutf8b(m)
+                >>> n
+                '\\xc3\\xa9\\xed\\xb2\\x99abcd'
+                >>> fromutf8b(n) == m
+                True
+                '''
+                # fast path - look for uDxxx prefixes in s
+                if "\xed" not in s:
+                    return s
+                u = s.decode("utf-8")
+                r = ""
+                for c in u:
+                    if ord(c) & 0xff00 == 0xdc00:
+                        r += chr(ord(c) & 0xff)
+                    else:
+                        r += c.encode("utf-8")
+                return r