upstream/mercurial-mirror Commit - r27699:c8d3392f

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import locale

10

import locale

11

import os

11

import os

12

import unicodedata

12

import unicodedata

13

14

from . import (

14

from . import (

15

error,

15

error,

16

)

16

)

17

18

# These unicode code points are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we have to strip them in some places for
# sanity. Each entry is the UTF-8 encoding of one code point.
_ignore = [
    unichr(int(codepoint, 16)).encode("utf-8")
    for codepoint in (
        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
        "206a 206b 206c 206d 206e 206f feff"
    ).split()
]
# hfsignoreclean() only scans strings that contain one of these two lead
# bytes, so check that every ignored sequence really starts with one of them
assert set([seq[0] for seq in _ignore]) == set(["\xe2", "\xef"])

26

27

def hfsignoreclean(s):
    """Strip from s every byte sequence listed in _ignore (HFS+ ignores them).

    s is returned as-is when it cannot contain any such sequence.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored sequence starts with "\xe2" or "\xef", so a string with
    # neither byte needs no scanning at all
    if "\xe2" not in s and "\xef" not in s:
        return s
    cleaned = s
    for ignored in _ignore:
        cleaned = cleaned.replace(ignored, '')
    return cleaned

39

40

def _getpreferredencoding():
    '''
    Return the codeset of the locale selected by the environment.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected.

    LC_CTYPE is switched to the environment's locale only for the duration
    of the query and is put back afterwards, even if the query fails. A
    locale.Error raised while switching is passed on to the caller.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process in the temporary locale, even when
        # nl_langinfo raises
        locale.setlocale(locale.LC_CTYPE, oldloc)

63

64

# Encoding names reported by the platform that should not be used as-is,
# each mapped to a callable returning the encoding to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

# HGENCODING wins when it is set to a non-empty value; otherwise the locale
# is asked and its answer passed through the fixers above. A locale error
# leaves us with plain ascii.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'

# error handler passed to decode() for local strings ("strict" by default)
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() when stored data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

79

80

class localstr(str):
    '''A string in the local encoding that remembers its UTF-8 original.

    The string value itself is the local representation l; the UTF-8
    form u is kept in the _utf8 attribute so that an unmodified string
    can be round-tripped to the local encoding and back.'''

    def __new__(cls, u, l):
        self = super(localstr, cls).__new__(cls, l)
        self._utf8 = u
        return self

    def __hash__(self):
        # hash the UTF-8 form, not the local one, to avoid collisions in
        # local string space
        return hash(self._utf8)

89

90

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are supposed to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Decoding is therefore attempted strictly as UTF-8 first, then with
    the fallback encoding (Latin-1), and as a last resort as UTF-8 with
    unknown characters replaced.

    When the local encoding cannot represent the text exactly, the
    result is a localstr, which keeps the UTF-8 form next to the local
    one so that fromlocal() can restore it without loss.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # a strict decode doubles as the check that s really is UTF-8
            decoded = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # local encoding is UTF-8 too: nothing to convert
                return s
            local = decoded.encode(encoding, "replace")
            if decoded != local.decode(encoding):
                # something was replaced: keep the UTF-8 original around
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # only ancient changesets should bring us here
            try:
                decoded = s.decode(fallbackencoding)
                local = decoded.encode(encoding, "replace")
                if decoded != local.decode(encoding):
                    return localstr(decoded.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch; the result can't be round-tripped
                decoded = s.decode("utf-8", "replace")
                return decoded.encode(encoding, "replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

150

151

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr simply hands back the UTF-8 form it carries. Anything
    else is decoded with the error mode taken from HGENCODINGMODE,
    which defaults to 'strict'; in that mode an undecodable character
    aborts with an error message. The other modes are 'replace', which
    substitutes a special Unicode character, and 'ignore', which drops
    the character.
    """

    if isinstance(s, localstr):
        # lossless round-trip: the original UTF-8 was cached
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes on either side of the offending position
        first = max(0, inst.start - 10)
        last = inst.start + 10
        sub = s[first:last]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

173

174

# East Asian Width classes that count as two columns. Ambiguous-width
# characters ("A") are treated as wide only when HGENCODINGAMBIGUOUS is
# set to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"

177

178

def colwidth(s):
    "Return the number of display columns taken by local-encoding string s"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

181

182

def ucolwidth(d):
    "Return the number of display columns taken by Unicode string d"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: assume one column per character
        return len(d)
    columns = 0
    for ch in d:
        # characters whose width class is listed in 'wide' take two columns
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns

188

189

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Slice ends are tried in increasing order from start + c, and the first
    slice s[start:x] that is exactly c columns wide is returned. Returns
    None when no slice matches.'''
    # x is an exclusive slice end, so it must be allowed to reach len(s);
    # stopping one short would never consider the substring that runs to
    # the very end of s
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None

196

197

def trim(s, width, ellipsis='', leftside=False):
    """Shorten local string 's' to at most 'width' columns.

    The length of 'ellipsis' counts against 'width'. The ellipsis is put
    on the side that was cut: the right end by default, the left end when
    'leftside' is True. A string that already fits is returned unchanged,
    and when 'width' leaves no room next to the ellipsis only (a prefix
    of) the ellipsis is returned.

    A string that cannot be decoded in the local encoding is cut by bytes
    instead of by columns.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: count bytes instead of columns
        if len(s) <= width:
            return s # already fits
        room = width - len(ellipsis)
        if room <= 0:
            # not even the whole ellipsis fits
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        return s # already fits

    room = width - len(ellipsis)
    if room <= 0:
        # not even the whole ellipsis fits
        return ellipsis[:width]

    # drop one more character from the trimmed side until the rest fits
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            if leftside:
                return ellipsis + kept.encode(encoding)
            return kept.encode(encoding) + ellipsis
    return ellipsis # no room for even one multi-column character

289

290

def _asciilower(s):
    '''lowercase s, which must contain ASCII characters only

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # decoding is done purely as validation; its result is not needed
    s.decode('ascii')
    lowered = s.lower()
    return lowered

296

297

def asciilower(s):
    '''lowercase ASCII string s, preferring the implementation in parsers

    The first call uses parsers.asciilower when that attribute exists
    and the pure Python _asciilower otherwise, then rebinds this
    module-level name to the chosen function so later calls go to it
    directly.'''
    global asciilower
    # importing here rather than at module level avoids a cyclic dependency
    # around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciilower', _asciilower)
    asciilower = chosen
    return chosen(s)

305

306

def _asciiupper(s):
    '''uppercase s, which must contain ASCII characters only

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # decoding is done purely as validation; its result is not needed
    s.decode('ascii')
    uppered = s.upper()
    return uppered

312

313

def asciiupper(s):
    '''uppercase ASCII string s, preferring the implementation in parsers

    The first call uses parsers.asciiupper when that attribute exists
    and the pure Python _asciiupper otherwise, then rebinds this
    module-level name to the chosen function so later calls go to it
    directly.'''
    global asciiupper
    # importing here rather than at module level avoids a cyclic dependency
    # around "parsers" in the pure Python build
    # (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciiupper', _asciiupper)
    asciiupper = chosen
    return chosen(s)

321

322

def lower(s):
    "best-effort encoding-aware lowercasing of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII: lowercase via unicode below
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the local bytes
            decoded = s._utf8.decode("utf-8")
        else:
            decoded = s.decode(encoding, encodingmode)

        lowered = decoded.lower()
        if decoded != lowered:
            return lowered.encode(encoding)
        # nothing changed: return s itself so a localstr stays a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

342

343

def upper(s):
    "best-effort encoding-aware uppercasing of local string s"
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII: take the encoding-aware path
        result = upperfallback(s)
    return result

349

350

def upperfallback(s):
    "uppercase local string s by way of unicode; used by upper() for non-ASCII"
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the local bytes
            decoded = s._utf8.decode("utf-8")
        else:
            decoded = s.decode(encoding, encodingmode)

        uppered = decoded.upper()
        if decoded != uppered:
            return uppered.encode(encoding)
        # nothing changed: return s itself so a localstr stays a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

365

366

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    The value is chosen per platform and should agree with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    upper = 1
    lower = -1

380

381

# byte -> JSON-escaped text; filled in by the first call to jsonescape()
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON has no way to carry bytes that are not Unicode text, so the
    input goes through the following steps:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # build the table once: control characters become \uXXXX escapes,
        # everything else maps to itself ...
        for code in xrange(256):
            ch = chr(code)
            if code < 32:
                _jsonmap[ch] = "\\u%04x" % code
            else:
                _jsonmap[ch] = ch
        # ... except for the characters that have a short escape in JSON
        _jsonmap.update({
            '\t': '\\t',
            '\n': '\\n',
            '\"': '\\"',
            '\\': '\\\\',
            '\b': '\\b',
            '\f': '\\f',
            '\r': '\\r',
        })

    utf8b = toutf8b(s)
    return ''.join(_jsonmap[c] for c in utf8b)

423

424

# number of bytes to attempt decoding, indexed by the high nibble of the
# first byte; 0 marks a single-byte (ASCII) character
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''return the full utf-8 character of s that begins at index pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    nbytes = _utf8len[ord(s[pos]) >> 4]
    if nbytes == 0:
        # ascii: the character is the single byte itself
        return s[pos]

    candidate = s[pos:pos + nbytes]
    # decoding is only a validity check; it raises on a bad sequence
    candidate.decode("utf-8")
    return candidate

442

443

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is a generic way to preserve data when working with schemes
    like JSON and XML that have no provision for arbitrary byte
    strings. As Mercurial often doesn't know what encoding data is in,
    we use so-called UTF-8b.

    A string that is already valid UTF-8 (or ASCII) passes unmodified.
    Otherwise, unsupported bytes are mapped to the UTF-16 surrogate
    range, uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if "\xed" not in s:
        # no byte that could start an encoded surrogate: a localstr or a
        # valid UTF-8 string needs no escaping at all
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            # existing U+DCxx characters have to be re-escaped as well
            escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
        except UnicodeDecodeError:
            # not the start of a valid character: escape this byte
            escape = True
        if escape:
            # map the single byte at pos into the U+DCxx range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        else:
            pos += len(c)
        pieces.append(c)
    return "".join(pieces)

499

500

def fromutf8b(s):

500

def fromutf8b(s):

501

'''Given a UTF-8b string, return a local, possibly-binary string.

501

'''Given a UTF-8b string, return a local, possibly-binary string.

502

503

return the original binary string. This

503

return the original binary string. This

504

is a round-trip process for strings like filenames, but metadata

504

is a round-trip process for strings like filenames, but metadata

505

that's was passed through tolocal will remain in UTF-8.

505

that's was passed through tolocal will remain in UTF-8.

506

507

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

507

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

508

>>> m = "\\xc3\\xa9\\x99abcd"

508

>>> m = "\\xc3\\xa9\\x99abcd"

509

>>> toutf8b(m)

509

>>> toutf8b(m)

510

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

510

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

511

>>> roundtrip(m)

511

>>> roundtrip(m)

512

True

512

True

513

>>> roundtrip("\\xc2\\xc2\\x80")

513

>>> roundtrip("\\xc2\\xc2\\x80")

514

True

514

True

515

>>> roundtrip("\\xef\\xbf\\xbd")

515

>>> roundtrip("\\xef\\xbf\\xbd")

516

True

516

True

517

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

517

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

518

True

518

True

519

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

520

True

519

'''

521

'''

520

522

521

# fast path - look for uDxxx prefixes in s

523

# fast path - look for uDxxx prefixes in s

522

if "\xed" not in s:

524

if "\xed" not in s:

523

return s

525

return s

524

526

525

u = s.decode("utf-8")

527

# We could do this with the unicode type but some Python builds

528

# use UTF-16 internally (issue5031) which causes non-BMP code

529

# points to be escaped. Instead, we use our handy getutf8char

530

# helper again to walk the string without "decoding" it.

531

526

r = ""

532

r = ""

527

for c in u:

533

pos = 0

528

if ord(c) & 0xffff00 == 0xdc00:

534

l = len(s)

529

r += chr(ord(c) & 0xff)

535

while pos < l:

530

else:

536

c = getutf8char(s, pos)

531

r += c.encode("utf-8")

537

pos += len(c)

538

# unescape U+DCxx characters

539

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

540

c = chr(ord(c.decode("utf-8")) & 0xff)

541

r += c

532

return r

542

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import locale
             import os
             import unicodedata
             from . import (
                 error,
             )
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity. Each entry of _ignore is the UTF-8 encoding of one of the
             # hexadecimal code points listed in the string below.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean only scans for
             # these sequences when "\xe2" or "\xef" occurs in its input, so every
             # sequence has to start with one of those two bytes
             assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip every code point that HFS+ ignores out of the string s.

                 s is a UTF-8 encoded byte string; the cleaned string is returned.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # Every ignored sequence starts with one of these two bytes, so a
                 # string containing neither can be handed back untouched.
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 cleaned = s
                 for seq in _ignore:
                     cleaned = cleaned.replace(seq, '')
                 return cleaned
             def _getpreferredencoding():
                 '''
                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.
                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the actually expected.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 result = locale.nl_langinfo(locale.CODESET)
                 locale.setlocale(locale.LC_CTYPE, oldloc)
                 return result
             # Maps an encoding name reported by the locale machinery to a callable
             # that returns the encoding name to use in its place.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # Pick the local encoding: HGENCODING wins when set and non-empty,
             # otherwise ask the locale module (defaulting to 'ascii') and run the
             # answer through _encodingfixers. A locale.Error also yields 'ascii'.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handling mode passed to decode() when reading local strings
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # encoding tried by tolocal() for stored strings that are not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 The object behaves as the local string l, while the UTF-8 string u
                 it was built from is kept in the _utf8 attribute. This allows
                 strings that are unmodified to be round-tripped to the local
                 encoding and back.'''
                 def __hash__(self):
                     # hash the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
                 def __new__(cls, u, l):
                     obj = super(localstr, cls).__new__(cls, l)
                     obj._utf8 = u
                     return obj
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Raises error.Abort when a codec lookup fails with LookupError.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         # characters the local encoding lacks are substituted
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the original UTF-8 next to the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(fallbackencoding)
                             r = u.encode(encoding, "replace")
                             if u == r.decode(encoding):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # lossy: cache the re-encoded UTF-8 form of the text
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             return u.encode(encoding, "replace") # can't round-trip
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr is answered from its cached UTF-8 form. Anything else is
                 decoded with the local encoding using the mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.

                 Raises error.Abort when decoding fails or a codec lookup fails.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip: the UTF-8 original was kept
                     return s._utf8
                 try:
                     decoded = s.decode(encoding, encodingmode)
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as exc:
                     # quote up to ten bytes on either side of the failure
                     begin = max(0, exc.start - 10)
                     context = s[begin:exc.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, exc))
                 except LookupError as exc:
                     raise error.Abort(exc, hint="please check your locale settings")
             # East Asian width classes that ucolwidth counts as two columns. Set
             # HGENCODINGAMBIGUOUS to 'wide' to add class "A" to the wide ones.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Return the number of display columns the local-encoding string s takes"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the number of display columns the Unicode string d takes"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for ch in d:
                     # characters whose width class is listed in 'wide' take two
                     if eaw(ch) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Return the substring of s that begins at byte index start and is
                 exactly c columns wide according to colwidth.

                 Falls through (returning None) when no candidate matches.'''
                 # NOTE(review): candidate end offsets stop before len(s), so a
                 # substring reaching the very end of s is never tried - confirm
                 # that this is intended.
                 for end in xrange(start + c, len(s)):
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 's' is returned unchanged when it already fits. If 's' cannot be
                 decoded with the local encoding, it is trimmed byte by byte instead
                 of column by column.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(encoding)
                 except UnicodeDecodeError:
                     # s is not valid in the local encoding: measure and cut it in
                     # bytes instead of columns
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         # width + len(ellipsis) is the width originally asked for
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # from here on, width is the room left for text beside the ellipsis
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     # width + len(ellipsis) is the width originally asked for
                     return ellipsis[:width + len(ellipsis)]
                 if leftside:
                     # drop i characters from the front, ellipsis goes in front
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     # drop i characters from the back, ellipsis goes at the end
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character per round until the remainder fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(encoding))
                 return ellipsis # not enough room for multi-column characters
             def _asciilower(s):
                 '''convert a string to lowercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.lower()
             def asciilower(s):
                 '''Lowercase the ASCII string s, picking the implementation on first use.

                 The first call replaces the module-level name asciilower with
                 parsers.asciilower when that exists, or with _asciilower otherwise,
                 and then delegates to it.'''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciilower', _asciilower)
                 asciilower = chosen
                 return chosen(s)
             def _asciiupper(s):
                 '''convert a string to uppercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.upper()
             def asciiupper(s):
                 '''Uppercase the ASCII string s, picking the implementation on first use.

                 The first call replaces the module-level name asciiupper with
                 parsers.asciiupper when that exists, or with _asciiupper otherwise,
                 and then delegates to it.'''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciiupper', _asciiupper)
                 asciiupper = chosen
                 return chosen(s)
             def lower(s):
                 '''best-effort encoding-aware lowercasing of local string s

                 s itself is returned when lowercasing changes nothing, so that a
                 localstr is preserved. Raises error.Abort when a codec lookup
                 fails.'''
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: take the encoding-aware route below
                     pass
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(encoding, encodingmode)
                     folded = text.lower()
                     if folded != text:
                         return folded.encode(encoding)
                     return s # preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as exc:
                     raise error.Abort(exc, hint="please check your locale settings")
             def upper(s):
                 '''best-effort encoding-aware uppercasing of local string s

                 The ASCII-only implementation is tried first; upperfallback takes
                 over when it reports non-ASCII input.'''
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 '''encoding-aware uppercasing of local string s

                 s itself is returned when uppercasing changes nothing, so that a
                 localstr is preserved. Raises error.Abort when a codec lookup
                 fails.'''
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(encoding, encodingmode)
                     folded = text.upper()
                     if folded != text:
                         return folded.encode(encoding)
                     return s # preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as exc:
                     raise error.Abort(exc, hint="please check your locale settings")
             class normcasespecs(object):
                 '''Describes the effect of a platform's normcase on ASCII strings.

                 The value is chosen per platform and should agree with what
                 normcase on that platform actually does:

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''
                 other = 0
                 upper = 1
                 lower = -1
             # byte -> JSON-escaped text; filled in by the first jsonescape() call
             _jsonmap = {}
             def jsonescape(s):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''
                 '''
                 if not _jsonmap:
                     for code in xrange(256):
                         ch = chr(code)
                         if code < 32:
                             # control characters get a \uXXXX escape by default
                             _jsonmap[ch] = "\\u%04x" % code
                         else:
                             _jsonmap[ch] = ch
                     # characters with a dedicated short escape override the above
                     _jsonmap.update({
                         '\t': '\\t',
                         '\n': '\\n',
                         '\"': '\\"',
                         '\\': '\\\\',
                         '\b': '\\b',
                         '\f': '\\f',
                         '\r': '\\r',
                     })
                 return ''.join([_jsonmap[ch] for ch in toutf8b(s)])
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''get the next full utf-8 character in the given string, starting at pos
                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 # find how many bytes to attempt decoding from first nibble
                 l = _utf8len[ord(s[pos]) >> 4]
                 if not l: # ascii
                     return s[pos]
                 c = s[pos:pos + l]
                 # validate with attempted decode
                 c.decode("utf-8")
                 return c
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # A localstr always answers with its cached UTF-8 form. This
                     # must not depend on whether the local rendering happens to
                     # contain a "\xed" byte, otherwise the local bytes would be
                     # escaped instead of the Unicode data being sent.
                     return s._utf8
                 if "\xed" not in s:
                     # without "\xed" there can be no U+DCxx sequence that needs
                     # re-escaping, so valid UTF-8 passes through untouched
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not the start of a valid character: escape this one byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     chunks.append(c)
                 # join once instead of growing a string piece by piece
                 return "".join(chunks)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.
                 return the original binary string. This
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.
                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
+                >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
+                True
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s
-                u = s.decode("utf-8")
+                # We could do this with the unicode type but some Python builds
+                # use UTF-16 internally (issue5031) which causes non-BMP code
+                # points to be escaped. Instead, we use our handy getutf8char
+                # helper again to walk the string without "decoding" it.
                 r = ""
-                for c in u:
+                pos = 0
-                    if ord(c) & 0xffff00 == 0xdc00:
+                l = len(s)
-                        r += chr(ord(c) & 0xff)
+                while pos < l:
-                    else:
+                    c = getutf8char(s, pos)
-                        r += c.encode("utf-8")
+                    pos += len(c)
+                    # unescape U+DCxx characters
+                    if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
+                        c = chr(ord(c.decode("utf-8")) & 0xff)
+                    r += c
                 return r