##// END OF EJS Templates
encoding: add hfsignoreclean to clean out HFS-ignored characters...
Augie Fackler -
r23596:885bd7c5 stable
parent child Browse files
Show More
@@ -1,438 +1,460 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 import error
8 import error
9 import unicodedata, locale, os
9 import unicodedata, locale, os
10
10
# Codepoints that HFS+ silently strips from filenames (Apple Technote
# 1150, "Unicode Subtleties"), so we need to ignore them in some places
# for sanity.
_ignore = [unichr(int(h, 16)).encode("utf-8")
           for h in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                     "206a 206b 206c 206d 206e 206f feff").split()]
# sanity check: hfsignoreclean's fast path keys on these lead bytes
assert set([c[0] for c in _ignore]) == set(["\xe2", "\xef"])
19
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # fast path: every ignored UTF-8 sequence starts with one of these bytes
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
32
def _getpreferredencoding():
    '''Return the encoding the user's locale environment actually requests.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a
    'fixer' for the mac-roman encoding, as it is unlikely that this
    encoding is the actually expected.
    '''
    if not hasattr(locale, 'CODESET'):
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's environment setting so
    # nl_langinfo reports the real codeset, then restore it
    saved = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    codeset = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, saved)

    return codeset
34
56
# Map misreported or legacy encoding names to a callable that produces
# the name we should actually use.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    _fixer = _encodingfixers.get(encoding)
    if _fixer is not None:
        encoding = _fixer()
except locale.Error:
    encoding = 'ascii'
# how to handle characters the local encoding cannot represent
# ('strict', 'replace' or 'ignore')
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# pre-locale-support repositories frequently contain latin-1 metadata
fallbackencoding = 'ISO-8859-1'
50
72
class localstr(str):
    '''A str subclass that remembers the UTF-8 text it was derived from,
    so unmodified strings can round-trip between the local encoding and
    UTF-8 without loss.'''
    def __new__(cls, u, l):
        obj = str.__new__(cls, l)
        # cache the canonical UTF-8 form alongside the local bytes
        obj._utf8 = u
        return obj
    def __hash__(self):
        # hash the UTF-8 form so distinct UTF-8 strings that share one
        # lossy local representation do not collide
        return hash(self._utf8)
60
82
61 def tolocal(s):
83 def tolocal(s):
62 """
84 """
63 Convert a string from internal UTF-8 to local encoding
85 Convert a string from internal UTF-8 to local encoding
64
86
65 All internal strings should be UTF-8 but some repos before the
87 All internal strings should be UTF-8 but some repos before the
66 implementation of locale support may contain latin1 or possibly
88 implementation of locale support may contain latin1 or possibly
67 other character sets. We attempt to decode everything strictly
89 other character sets. We attempt to decode everything strictly
68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
69 replace unknown characters.
91 replace unknown characters.
70
92
71 The localstr class is used to cache the known UTF-8 encoding of
93 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
94 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
95 round-trip conversion back to UTF-8.
74
96
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
97 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
98 >>> l = tolocal(u)
77 >>> l
99 >>> l
78 'foo: ?'
100 'foo: ?'
79 >>> fromlocal(l)
101 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
102 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
103 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
104 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> len(d) # no collision
105 >>> len(d) # no collision
84 2
106 2
85 >>> 'foo: ?' in d
107 >>> 'foo: ?' in d
86 False
108 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
110 >>> l = tolocal(l1)
89 >>> l
111 >>> l
90 'foo: ?'
112 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
113 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
114 'foo: \\xc3\\xa4'
93 """
115 """
94
116
95 try:
117 try:
96 try:
118 try:
97 # make sure string is actually stored in UTF-8
119 # make sure string is actually stored in UTF-8
98 u = s.decode('UTF-8')
120 u = s.decode('UTF-8')
99 if encoding == 'UTF-8':
121 if encoding == 'UTF-8':
100 # fast path
122 # fast path
101 return s
123 return s
102 r = u.encode(encoding, "replace")
124 r = u.encode(encoding, "replace")
103 if u == r.decode(encoding):
125 if u == r.decode(encoding):
104 # r is a safe, non-lossy encoding of s
126 # r is a safe, non-lossy encoding of s
105 return r
127 return r
106 return localstr(s, r)
128 return localstr(s, r)
107 except UnicodeDecodeError:
129 except UnicodeDecodeError:
108 # we should only get here if we're looking at an ancient changeset
130 # we should only get here if we're looking at an ancient changeset
109 try:
131 try:
110 u = s.decode(fallbackencoding)
132 u = s.decode(fallbackencoding)
111 r = u.encode(encoding, "replace")
133 r = u.encode(encoding, "replace")
112 if u == r.decode(encoding):
134 if u == r.decode(encoding):
113 # r is a safe, non-lossy encoding of s
135 # r is a safe, non-lossy encoding of s
114 return r
136 return r
115 return localstr(u.encode('UTF-8'), r)
137 return localstr(u.encode('UTF-8'), r)
116 except UnicodeDecodeError:
138 except UnicodeDecodeError:
117 u = s.decode("utf-8", "replace") # last ditch
139 u = s.decode("utf-8", "replace") # last ditch
118 return u.encode(encoding, "replace") # can't round-trip
140 return u.encode(encoding, "replace") # can't round-trip
119 except LookupError, k:
141 except LookupError, k:
120 raise error.Abort(k, hint="please check your locale settings")
142 raise error.Abort(k, hint="please check your locale settings")
121
143
122 def fromlocal(s):
144 def fromlocal(s):
123 """
145 """
124 Convert a string from the local character encoding to UTF-8
146 Convert a string from the local character encoding to UTF-8
125
147
126 We attempt to decode strings using the encoding mode set by
148 We attempt to decode strings using the encoding mode set by
127 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
128 characters will cause an error message. Other modes include
150 characters will cause an error message. Other modes include
129 'replace', which replaces unknown characters with a special
151 'replace', which replaces unknown characters with a special
130 Unicode character, and 'ignore', which drops the character.
152 Unicode character, and 'ignore', which drops the character.
131 """
153 """
132
154
133 # can we do a lossless round-trip?
155 # can we do a lossless round-trip?
134 if isinstance(s, localstr):
156 if isinstance(s, localstr):
135 return s._utf8
157 return s._utf8
136
158
137 try:
159 try:
138 return s.decode(encoding, encodingmode).encode("utf-8")
160 return s.decode(encoding, encodingmode).encode("utf-8")
139 except UnicodeDecodeError, inst:
161 except UnicodeDecodeError, inst:
140 sub = s[max(0, inst.start - 10):inst.start + 10]
162 sub = s[max(0, inst.start - 10):inst.start + 10]
141 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
163 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
142 except LookupError, k:
164 except LookupError, k:
143 raise error.Abort(k, hint="please check your locale settings")
165 raise error.Abort(k, hint="please check your locale settings")
144
166
# East-asian width classes counted as two display columns; setting
# HGENCODINGAMBIGUOUS to 'wide' additionally treats the Ambiguous ('A')
# class as wide.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"
148
170
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently; undecodable bytes still occupy one cell each
    d = s.decode(encoding, 'replace')
    return ucolwidth(d)
152
174
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    # east_asian_width is missing on some minimal Python builds
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    # characters in a 'wide' class take two terminal cells, others one
    return sum(eaw(c) in wide and 2 or 1 for c in d)
159
181
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate one byte at a time until it spans c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
167
189
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable: fall back to trimming by byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side one at a time until the
    # remainder fits in the available columns
    for ndropped in xrange(1, len(u)):
        if leftside:
            usub = u[ndropped:]
        else:
            usub = u[:-ndropped]
        if ucolwidth(usub) <= width:
            if leftside:
                return ellipsis + usub.encode(encoding)
            return usub.encode(encoding) + ellipsis
    return ellipsis # no enough room for multi-column characters
260
282
261 def _asciilower(s):
283 def _asciilower(s):
262 '''convert a string to lowercase if ASCII
284 '''convert a string to lowercase if ASCII
263
285
264 Raises UnicodeDecodeError if non-ASCII characters are found.'''
286 Raises UnicodeDecodeError if non-ASCII characters are found.'''
265 s.decode('ascii')
287 s.decode('ascii')
266 return s.lower()
288 return s.lower()
267
289
def asciilower(s):
    # deferred import: a top-level import of "parsers" would create a
    # cycle in the pure Python build
    # (util => i18n => encoding => parsers => util)
    import parsers
    # prefer the C implementation when available, then replace this
    # function with it so later calls skip the lookup entirely
    fn = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = fn
    return fn(s)
276
298
277 def lower(s):
299 def lower(s):
278 "best-effort encoding-aware case-folding of local string s"
300 "best-effort encoding-aware case-folding of local string s"
279 try:
301 try:
280 return asciilower(s)
302 return asciilower(s)
281 except UnicodeDecodeError:
303 except UnicodeDecodeError:
282 pass
304 pass
283 try:
305 try:
284 if isinstance(s, localstr):
306 if isinstance(s, localstr):
285 u = s._utf8.decode("utf-8")
307 u = s._utf8.decode("utf-8")
286 else:
308 else:
287 u = s.decode(encoding, encodingmode)
309 u = s.decode(encoding, encodingmode)
288
310
289 lu = u.lower()
311 lu = u.lower()
290 if u == lu:
312 if u == lu:
291 return s # preserve localstring
313 return s # preserve localstring
292 return lu.encode(encoding)
314 return lu.encode(encoding)
293 except UnicodeError:
315 except UnicodeError:
294 return s.lower() # we don't know how to fold this except in ASCII
316 return s.lower() # we don't know how to fold this except in ASCII
295 except LookupError, k:
317 except LookupError, k:
296 raise error.Abort(k, hint="please check your locale settings")
318 raise error.Abort(k, hint="please check your locale settings")
297
319
298 def upper(s):
320 def upper(s):
299 "best-effort encoding-aware case-folding of local string s"
321 "best-effort encoding-aware case-folding of local string s"
300 try:
322 try:
301 s.decode('ascii') # throw exception for non-ASCII character
323 s.decode('ascii') # throw exception for non-ASCII character
302 return s.upper()
324 return s.upper()
303 except UnicodeDecodeError:
325 except UnicodeDecodeError:
304 pass
326 pass
305 try:
327 try:
306 if isinstance(s, localstr):
328 if isinstance(s, localstr):
307 u = s._utf8.decode("utf-8")
329 u = s._utf8.decode("utf-8")
308 else:
330 else:
309 u = s.decode(encoding, encodingmode)
331 u = s.decode(encoding, encodingmode)
310
332
311 uu = u.upper()
333 uu = u.upper()
312 if u == uu:
334 if u == uu:
313 return s # preserve localstring
335 return s # preserve localstring
314 return uu.encode(encoding)
336 return uu.encode(encoding)
315 except UnicodeError:
337 except UnicodeError:
316 return s.upper() # we don't know how to fold this except in ASCII
338 return s.upper() # we don't know how to fold this except in ASCII
317 except LookupError, k:
339 except LookupError, k:
318 raise error.Abort(k, hint="please check your locale settings")
340 raise error.Abort(k, hint="please check your locale settings")
319
341
# char -> JSON-escaped representation, built lazily by jsonescape()
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters must be \uXXXX-escaped; write the backslash
        # explicitly ('\\u') instead of relying on Python 2 silently
        # leaving the invalid '\u' escape literal in a byte string (the
        # latter is a syntax error under Python 3 and warned by -3)
        for x in xrange(32):
            _jsonmap[chr(x)] = '\\u%04x' % x
        # all other bytes pass through unchanged
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # characters with dedicated short JSON escapes
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))
362
384
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # cached lossless UTF-8 form
        return s._utf8

    try:
        s.decode('utf-8')
        return s # already valid UTF-8 (or ASCII)
    except UnicodeDecodeError:
        # surrogate-encode any characters that don't round-trip
        kept = s.decode('utf-8', 'ignore').encode('utf-8')
        out = []
        pos = 0
        for c in s:
            if kept[pos:pos + 1] == c:
                # this byte survived the decode/encode round-trip
                out.append(c)
                pos += 1
            else:
                # non-UTF-8 byte: map it into the U+DCxx surrogate range
                out.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
        return ''.join(out)
411
433
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Bytes that were surrogate-escaped by toutf8b come back out as the
    original binary string. This is a round-trip process for strings
    like filenames, but metadata that was passed through tolocal will
    remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s: every escaped byte's
    # UTF-8 encoding begins with 0xed, so its absence means no work
    if "\xed" not in s:
        return s

    u = s.decode("utf-8")
    out = []
    for c in u:
        if ord(c) & 0xff00 == 0xdc00:
            # surrogate-escaped: recover the original byte value
            out.append(chr(ord(c) & 0xff))
        else:
            out.append(c.encode("utf-8"))
    return ''.join(out)
General Comments 0
You need to be logged in to leave comments. Login now