upstream/mercurial-mirror Commit - r28068:9ece901f

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import locale

11

import locale

11

import os

12

import os

12

import unicodedata

13

import unicodedata

13

14

from . import (

15

from . import (

15

error,

16

error,

16

)

17

)

17

18

# Apple Technote 1150 ("Unicode Subtleties") lists these unicode
# characters as ignored by HFS+, so we need to ignore them in some places
# for sanity. Each entry is kept as its UTF-8 byte sequence.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f "
              "202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f "
              "feff").split()
]
# hfsignoreclean() only scans for these two lead bytes, so verify that
# every ignored sequence really starts with one of them
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])

26

27

def hfsignoreclean(s):
    """Strip the codepoints that HFS+ ignores out of the UTF-8 string s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored sequence starts with one of these two lead bytes, so a
    # string containing neither can be handed back untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s

39

40

def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected.

    Returns the codeset name reported by nl_langinfo() for the locale named
    by the environment, or the encoding from locale.getdefaultlocale() when
    the platform has no locale.CODESET.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # always put the previous LC_CTYPE back, even if the query above
        # raised, so the process locale is never left modified
        locale.setlocale(locale.LC_CTYPE, oldloc)

63

64

# Encoding names that should not be taken at face value, mapped to a
# callable returning the name to use instead.
_encodingfixers = {
    'mac-roman': _getpreferredencoding,
    'ANSI_X3.4-1968': lambda: 'ascii',
    '646': lambda: 'ascii',
}

# HGENCODING wins when set; otherwise ask the locale and pass the answer
# through the fixers above.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'

79

80

class localstr(str):
    '''A str holding the local-encoding form of a string while remembering
    the UTF-8 bytes it came from, so that strings that are unmodified can
    be round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # u: the UTF-8 original, l: its rendering in the local encoding
        self = str.__new__(cls, l)
        self._utf8 = u
        return self

    def __hash__(self):
        # hash the UTF-8 original instead of the local form to avoid
        # collisions in local string space
        return hash(self._utf8)

89

90

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repos that predate
    locale support may hold latin1 or possibly other character sets.
    Decoding is therefore attempted strictly as UTF-8 first, then with
    the fallback encoding, and as a last resort as UTF-8 with unknown
    characters replaced.

    Whenever the local rendering is lossy, the result is a localstr,
    which keeps the known UTF-8 encoding next to the local
    representation so that fromlocal() can convert it back to UTF-8
    without loss.

    An unknown encoding name is reported as error.Abort.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # strict decode: checks the string really is stored as UTF-8
            utext = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path, nothing to convert
                return s
            local = utext.encode(encoding, "replace")
            if utext != local.decode(encoding):
                # lossy: remember the UTF-8 original alongside
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient
            # changeset
            try:
                utext = s.decode(fallbackencoding)
                local = utext.encode(encoding, "replace")
                if utext != local.decode(encoding):
                    return localstr(utext.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch: the result can't be round-tripped
                utext = s.decode("utf-8", "replace")
                return utext.encode(encoding, "replace")
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")

150

151

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Strings are decoded using the encoding mode set by HGENCODINGMODE,
    which defaults to 'strict'. In this mode, unknown characters will
    cause an error message. Other modes include 'replace', which
    replaces unknown characters with a special Unicode character, and
    'ignore', which drops the character.

    A localstr is answered directly from its cached UTF-8 value.
    """

    # a localstr already carries its lossless UTF-8 form
    if isinstance(s, localstr):
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes on either side of the offending position
        first = max(0, inst.start - 10)
        last = inst.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[first:last], inst))
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")

173

174

# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
# 'wide' to treat them as wide.
# 'wide' holds the East Asian Width classes counted as two columns:
# W and F always, plus A (ambiguous) when requested.
wide = ("WFA" if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")

177

178

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced instead of raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

181

182

def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose East Asian Width class is listed in the module-level
    'wide' setting count as two columns, everything else as one. When
    unicodedata provides no east_asian_width, the length of d is returned.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in wide else 1 for c in d)

188

189

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate end offsets from start + c up to, but not including,
    len(s) are tried in order. None is returned when no candidate is
    exactly c columns wide.'''
    end = start + c
    limit = len(s)
    while end < limit:
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
        end += 1
    return None

196

197

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # s can't be decoded, so work on the raw bytes instead
        if len(s) <= width: # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0: # not enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0: # not enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on each pass until
    # what is left fits into the remaining columns
    for cut in xrange(1, len(u)):
        if leftside:
            usub = u[cut:]
        else:
            usub = u[:-cut]
        if ucolwidth(usub) <= room:
            kept = usub.encode(encoding)
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    return ellipsis # not enough room for multi-column characters

289

290

def _asciilower(s):
    '''lowercase a string that must be pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decoded value is thrown away; decoding only validates s
    s.decode('ascii')
    folded = s.lower()
    return folded

296

297

def asciilower(s):
    '''lowercase an ASCII string using parsers.asciilower when available

    The first call rebinds the module-level name to the implementation
    that was picked, so this wrapper only runs once.'''
    global asciilower
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciilower', _asciilower)
    asciilower = chosen
    return chosen(s)

305

306

def _asciiupper(s):
    '''uppercase a string that must be pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decoded value is thrown away; decoding only validates s
    s.decode('ascii')
    folded = s.upper()
    return folded

312

313

def asciiupper(s):
    '''uppercase an ASCII string using parsers.asciiupper when available

    The first call rebinds the module-level name to the implementation
    that was picked, so this wrapper only runs once.'''
    global asciiupper
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    chosen = getattr(parsers, 'asciiupper', _asciiupper)
    asciiupper = chosen
    return chosen(s)

321

322

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII: continue with the unicode-aware path below
        pass
    try:
        if not isinstance(s, localstr):
            decoded = s.decode(encoding, encodingmode)
        else:
            # use the cached UTF-8 original rather than the local form
            decoded = s._utf8.decode("utf-8")

        folded = decoded.lower()
        if decoded != folded:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")

342

343

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII: let the encoding-aware fallback handle it
        folded = upperfallback(s)
    return folded

349

350

def upperfallback(s):
    "encoding-aware uppercasing of local string s, used for non-ASCII input"
    try:
        if not isinstance(s, localstr):
            decoded = s.decode(encoding, encodingmode)
        else:
            # use the cached UTF-8 original rather than the local form
            decoded = s._utf8.decode("utf-8")

        folded = decoded.upper()
        if decoded != folded:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")

365

366

class normcasespecs(object):
    '''describes what normcase does to ASCII strings on a platform

    The value is chosen per platform and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    lower = -1
    upper = 1

380

381

# JSON escape tables indexed by byte value. Control characters and 0x7f
# become \uXXXX escapes, a few characters get short escapes, and other
# ASCII maps to itself.
_jsonmap = list("\\u%04x" % x for x in xrange(32))
_jsonmap += list(chr(x) for x in xrange(32, 127))
_jsonmap += ['\\u007f']
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# the paranoid table stops at ASCII; only the regular table goes on to
# pass bytes 128-255 through unchanged
_paranoidjsonmap = list(_jsonmap)
_jsonmap += list(chr(x) for x in xrange(128, 256))

392

395

393

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii characters are also escaped. This is suitable for
    web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    '''

    jm = _paranoidjsonmap if paranoid else _jsonmap

    u8chars = toutf8b(s)
    try:
        # fast path: every byte has an entry in the selected table
        return ''.join(jm[b] for b in bytearray(u8chars))
    except IndexError:
        pass
    # Some byte had no table entry, so escape per UTF-16 code unit
    # instead; a non-BMP char is represented as a surrogate pair.
    units = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    del units[0] # drop BOM
    pieces = []
    for code in units:
        if code < 128:
            pieces.append(jm[code])
        else:
            pieces.append('\\u%04x' % code)
    return ''.join(pieces)

421

449

422

# Indexed by the high nibble of a byte: how many bytes getutf8char()
# should try to decode starting there (0 means plain ASCII).
_utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]

423

451

424

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos]
    # the first nibble of the lead byte says how many bytes to attempt
    nbytes = _utf8len[ord(lead) >> 4]
    if nbytes == 0: # ascii
        return lead

    char = s[pos:pos + nbytes]
    # decoding is done purely for validation; the result is discarded
    char.decode("utf-8")
    return char

440

468

441

def toutf8b(s):

469

def toutf8b(s):

442

'''convert a local, possibly-binary string into UTF-8b

470

'''convert a local, possibly-binary string into UTF-8b

443

471

444

This is intended as a generic method to preserve data when working

472

This is intended as a generic method to preserve data when working

445

with schemes like JSON and XML that have no provision for

473

with schemes like JSON and XML that have no provision for

446

arbitrary byte strings. As Mercurial often doesn't know

474

arbitrary byte strings. As Mercurial often doesn't know

447

what encoding data is in, we use so-called UTF-8b.

475

what encoding data is in, we use so-called UTF-8b.

448

476

449

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

477

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

450

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

478

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

451

uDC00-uDCFF.

479

uDC00-uDCFF.

452

480

453

Principles of operation:

481

Principles of operation:

454

482

455

- ASCII and UTF-8 data successfully round-trips and is understood

483

- ASCII and UTF-8 data successfully round-trips and is understood

456

by Unicode-oriented clients

484

by Unicode-oriented clients

457

- filenames and file contents in arbitrary other encodings can have

485

- filenames and file contents in arbitrary other encodings can have

458

be round-tripped or recovered by clueful clients

486

be round-tripped or recovered by clueful clients

459

- local strings that have a cached known UTF-8 encoding (aka

487

- local strings that have a cached known UTF-8 encoding (aka

460

localstr) get sent as UTF-8 so Unicode-oriented clients get the

488

localstr) get sent as UTF-8 so Unicode-oriented clients get the

461

Unicode data they want

489

Unicode data they want

462

- because we must preserve UTF-8 bytestring in places such as

490

- because we must preserve UTF-8 bytestring in places such as

463

filenames, metadata can't be roundtripped without help

491

filenames, metadata can't be roundtripped without help

464

492

465

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

493

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

466

arbitrary bytes into an internal Unicode format that can be

494

arbitrary bytes into an internal Unicode format that can be

467

re-encoded back into the original. Here we are exposing the

495

re-encoded back into the original. Here we are exposing the

468

internal surrogate encoding as a UTF-8 string.)

496

internal surrogate encoding as a UTF-8 string.)

469

'''

497

'''

470

498

471

if "\xed" not in s:

499

if "\xed" not in s:

472

if isinstance(s, localstr):

500

if isinstance(s, localstr):

473

return s._utf8

501

return s._utf8

474

try:

502

try:

475

s.decode('utf-8')

503

s.decode('utf-8')

476

return s

504

return s

477

except UnicodeDecodeError:

505

except UnicodeDecodeError:

478

pass

506

pass

479

507

480

r = ""

508

r = ""

481

pos = 0

509

pos = 0

482

l = len(s)

510

l = len(s)

483

while pos < l:

511

while pos < l:

484

try:

512

try:

485

c = getutf8char(s, pos)

513

c = getutf8char(s, pos)

486

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

514

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

487

# have to re-escape existing U+DCxx characters

515

# have to re-escape existing U+DCxx characters

488

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

516

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

489

pos += 1

517

pos += 1

490

else:

518

else:

491

pos += len(c)

519

pos += len(c)

492

except UnicodeDecodeError:

520

except UnicodeDecodeError:

493

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

521

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

494

pos += 1

522

pos += 1

495

r += c

523

r += c

496

return r

524

return r

497

525

498

def fromutf8b(s):

526

def fromutf8b(s):

499

'''Given a UTF-8b string, return a local, possibly-binary string.

527

'''Given a UTF-8b string, return a local, possibly-binary string.

500

528

501

return the original binary string. This

529

return the original binary string. This

502

is a round-trip process for strings like filenames, but metadata

530

is a round-trip process for strings like filenames, but metadata

503

that's was passed through tolocal will remain in UTF-8.

531

that's was passed through tolocal will remain in UTF-8.

504

532

505

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

533

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

506

>>> m = "\\xc3\\xa9\\x99abcd"

534

>>> m = "\\xc3\\xa9\\x99abcd"

507

>>> toutf8b(m)

535

>>> toutf8b(m)

508

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

536

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

509

>>> roundtrip(m)

537

>>> roundtrip(m)

510

True

538

True

511

>>> roundtrip("\\xc2\\xc2\\x80")

539

>>> roundtrip("\\xc2\\xc2\\x80")

512

True

540

True

513

>>> roundtrip("\\xef\\xbf\\xbd")

541

>>> roundtrip("\\xef\\xbf\\xbd")

514

True

542

True

515

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

543

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

516

True

544

True

517

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

545

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

518

True

546

True

519

'''

547

'''

520

548

521

# fast path - look for uDxxx prefixes in s

549

# fast path - look for uDxxx prefixes in s

522

if "\xed" not in s:

550

if "\xed" not in s:

523

return s

551

return s

524

552

525

# We could do this with the unicode type but some Python builds

553

# We could do this with the unicode type but some Python builds

526

# use UTF-16 internally (issue5031) which causes non-BMP code

554

# use UTF-16 internally (issue5031) which causes non-BMP code

527

# points to be escaped. Instead, we use our handy getutf8char

555

# points to be escaped. Instead, we use our handy getutf8char

528

# helper again to walk the string without "decoding" it.

556

# helper again to walk the string without "decoding" it.

529

557

530

r = ""

558

r = ""

531

pos = 0

559

pos = 0

532

l = len(s)

560

l = len(s)

533

while pos < l:

561

while pos < l:

534

c = getutf8char(s, pos)

562

c = getutf8char(s, pos)

535

pos += len(c)

563

pos += len(c)

536

# unescape U+DCxx characters

564

# unescape U+DCxx characters

537

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

565

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

538

c = chr(ord(c.decode("utf-8")) & 0xff)

566

c = chr(ord(c.decode("utf-8")) & 0xff)

539

r += c

567

r += c

540

return r

568

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
+            import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
             )
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             #
             # Each entry of _ignore is the UTF-8 byte sequence of one of the code
             # points listed (in hex) below.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only scans for
             # these sequences after spotting one of two lead bytes, so every
             # sequence must start with "\xe2" or "\xef"
             assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip the codepoints that HFS+ ignores out of s.

                 s is a byte string (UTF-8 encoded in the examples below); the
                 result is s with every sequence listed in _ignore removed.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # Cheap pre-check: every ignored sequence starts with one of these
                 # two bytes, so most strings can be returned untouched.
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for seq in _ignore:
                     s = s.replace(seq, '')
                 return s
             def _getpreferredencoding():
                 '''
                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.
                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the actually expected.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 result = locale.nl_langinfo(locale.CODESET)
                 locale.setlocale(locale.LC_CTYPE, oldloc)
                 return result
             # Corrections for encoding names that the locale machinery may report.
             # Each value is a zero-argument callable returning the name to use.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # Pick the local encoding: a non-empty HGENCODING wins; otherwise ask
             # the locale module and run its answer through _encodingfixers.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 # the locale could not be queried; fall back to ASCII
                 encoding = 'ascii'
             # error-handling mode passed to decode() by fromlocal(), lower() and
             # upperfallback()
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # encoding tried by tolocal() for data that is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 The str value is the local representation; the UTF-8 form is kept in
                 the _utf8 attribute so that unmodified strings can be round-tripped
                 to the local encoding and back.'''

                 def __new__(cls, u, l):
                     obj = str.__new__(cls, l)
                     obj._utf8 = u
                     return obj

                 def __hash__(self):
                     # hash the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns a plain string when the local form loses nothing (or when
                 nothing better can be done), and a localstr when the local form is
                 lossy but the UTF-8 original is known. Raises error.Abort if the
                 configured encoding is unknown to Python.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original alongside the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(fallbackencoding)
                             r = u.encode(encoding, "replace")
                             if u == r.decode(encoding):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # lossy: cache the UTF-8 re-encoding of the fallback decode
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             return u.encode(encoding, "replace") # can't round-trip
                 except LookupError as k:
                     # raised by the codec machinery when 'encoding' is not a known codec
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 Decoding uses the error mode selected by HGENCODINGMODE, which
                 defaults to 'strict'. In that mode an undecodable character aborts
                 with an error message. The other modes are 'replace', which
                 substitutes a special Unicode character for anything unknown, and
                 'ignore', which drops it.
                 """
                 if isinstance(s, localstr):
                     # the UTF-8 original was cached: lossless round-trip
                     return s._utf8

                 try:
                     decoded = s.decode(encoding, encodingmode)
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # show up to ten bytes either side of the offending position
                     first = max(0, err.start - 10)
                     context = s[first:err.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             # How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
             # 'wide' to treat them as wide. The value is the set of East Asian Width
             # classes that ucolwidth() counts as two columns.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Return the display width, in columns, of local-encoding string s"
                 # undecodable bytes become replacement characters rather than errors
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display width, in columns, of Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 # characters whose width class is listed in 'wide' take two columns
                 return sum(2 if eaw(ch) in wide else 1 for ch in d)
             def getcols(s, start, c):
                 '''Find a substring of s, beginning at byte index start, that is
                 exactly c columns wide according to colwidth.

                 Candidate end offsets run from start + c up to, but not including,
                 len(s). None is returned implicitly when no candidate matches.'''
                 for end in xrange(start + c, len(s)):
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 If 's' cannot be decoded in the local encoding, it is trimmed by
                 bytes instead of by display columns.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(encoding)
                 except UnicodeDecodeError:
                     # undecodable input: fall back to counting bytes, not columns
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # from here on 'width' is the room left for text beside the ellipsis
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character per iteration from the trimmed side until
                 # what remains fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(encoding))
                 return ellipsis # not enough room for multi-column characters
             def _asciilower(s):
                 '''convert a string to lowercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.lower()
             def asciilower(s):
                 '''Lowercase the ASCII string s.

                 The first call replaces this module-level name with
                 parsers.asciilower if it exists, else with _asciilower, so later
                 calls go straight to the chosen implementation.'''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciilower', _asciilower)
                 asciilower = chosen
                 return chosen(s)
             def _asciiupper(s):
                 '''convert a string to uppercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.upper()
             def asciiupper(s):
                 '''Uppercase the ASCII string s.

                 The first call replaces this module-level name with
                 parsers.asciiupper if it exists, else with _asciiupper, so later
                 calls go straight to the chosen implementation.'''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciiupper', _asciiupper)
                 asciiupper = chosen
                 return chosen(s)
             def lower(s):
                 "best-effort encoding-aware lowercasing of local string s"
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: take the encoding-aware route below
                     pass

                 try:
                     if isinstance(s, localstr):
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(encoding, encodingmode)
                     folded = decoded.lower()
                     if folded != decoded:
                         return folded.encode(encoding)
                     return s # preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware uppercasing of local string s"
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: use the encoding-aware implementation
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 "encoding-aware uppercasing of local string s, used by upper()"
                 try:
                     if isinstance(s, localstr):
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(encoding, encodingmode)
                     folded = decoded.upper()
                     if folded != decoded:
                         return folded.encode(encoding)
                     return s # preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings

                 This is specified per platform, and should be consistent with what
                 normcase on that platform actually does.

                 This should be kept in sync with normcase_spec in util.h.'''

                 # normcase lowercases ASCII strings
                 lower = -1
                 # normcase uppercases ASCII strings
                 upper = 1
                 # the fallback function should always be called
                 other = 0
             _jsonmap = []
             _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-            _jsonmap.extend(chr(x) for x in xrange(32, 256))
+            _jsonmap.extend(chr(x) for x in xrange(32, 127))
-            _jsonmap[0x7f] = '\\u007f'
+            _jsonmap.append('\\u007f')
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
+            _paranoidjsonmap = _jsonmap[:]
+            _jsonmap.extend(chr(x) for x in xrange(128, 256))
-            def jsonescape(s):
+            def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON
                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:
                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping
                 (escapes are doubled in these tests)
                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''
+                If paranoid, non-ascii characters are also escaped. This is suitable for
+                web output.
+                >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+                'escape boundary: ~ \\\\u007f \\\\u0080'
+                >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+                'a weird byte: \\\\udcdd'
+                >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+                'utf-8: caf\\\\u00e9'
+                >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+                'non-BMP: \\\\ud834\\\\udd1e'
                 '''
-                return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+                if paranoid:
+                    jm = _paranoidjsonmap
+                else:
+                    jm = _jsonmap
+                u8chars = toutf8b(s)
+                try:
+                    return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
+                except IndexError:
+                    pass
+                # non-BMP char is represented as UTF-16 surrogate pair
+                u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+                u16codes.pop(0)  # drop BOM
+                return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
             # Indexed by the high nibble of a byte (see getutf8char): how many bytes
             # to try decoding as one UTF-8 character. 0 marks an ASCII byte, which
             # stands alone. Nibbles 0x8-0xb get 1, so such a byte is decoded on its
             # own, which fails because it cannot start a character.
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''return the complete utf-8 character that begins at offset pos of s

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos]
                 # the high nibble of the lead byte gives the number of bytes to try
                 width = _utf8len[ord(lead) >> 4]
                 if width == 0: # ascii
                     return lead

                 char = s[pos:pos + width]
                 # decoding is done purely for validation; the result is discarded
                 char.decode("utf-8")
                 return char
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 # fast paths, usable only when s holds no "\xed" byte at all
                 if "\xed" not in s:
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass

                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         # have to re-escape existing U+DCxx characters
                         escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
                     except UnicodeDecodeError:
                         # no valid character starts here: escape this one byte
                         escape = True
                     if escape:
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     else:
                         pos += len(c)
                     pieces.append(c)
                 return "".join(pieces)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 This returns the original binary string. It is a round-trip process
                 for strings like filenames, but metadata that was passed through
                 tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - every escaped byte starts with "\xed", so a string
                 # without that byte has nothing to unescape
                 if "\xed" not in s:
                     return s

                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         # unescape U+DCxx: the low 8 bits are the original byte
                         c = chr(ord(c.decode("utf-8")) & 0xff)
                     pieces.append(c)
                 return "".join(pieces)