upstream/mercurial-mirror Commit - r28069:b2d24c28

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

)

17

)

18

19

# These unicode characters are ignored by HFS+ (Apple Technote 1150,

19

# These unicode characters are ignored by HFS+ (Apple Technote 1150,

20

# "Unicode Subtleties"), so we need to ignore them in some places for

20

# "Unicode Subtleties"), so we need to ignore them in some places for

21

# sanity.

21

# sanity.

22

_ignore = [unichr(int(x, 16)).encode("utf-8") for x in

22

_ignore = [unichr(int(x, 16)).encode("utf-8") for x in

23

"200c 200d 200e 200f 202a 202b 202c 202d 202e "

23

"200c 200d 200e 200f 202a 202b 202c 202d 202e "

24

"206a 206b 206c 206d 206e 206f feff".split()]

24

"206a 206b 206c 206d 206e 206f feff".split()]

25

# verify the next function will work

25

# verify the next function will work

26

assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])

26

assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])

27

28

def hfsignoreclean(s):
    """Strip from s every codepoint that HFS+ ignores.

    s is a UTF-8 encoded byte string; the cleaned string is returned.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignored sequence starts with one of these two bytes, so a
    # string containing neither cannot need cleaning
    if "\xe2" not in s and "\xef" not in s:
        return s
    cleaned = s
    for seq in _ignore:
        cleaned = cleaned.replace(seq, '')
    return cleaned

40

41

def _getpreferredencoding():

41

def _getpreferredencoding():

42

'''

42

'''

43

On darwin, getpreferredencoding ignores the locale environment and

43

On darwin, getpreferredencoding ignores the locale environment and

44

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

44

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

45

for Python 2.7 and up. This is the same corrected code for earlier

45

for Python 2.7 and up. This is the same corrected code for earlier

46

Python versions.

46

Python versions.

47

48

However, we can't use a version check for this method, as some distributions

48

However, we can't use a version check for this method, as some distributions

49

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

49

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

50

encoding, as it is unlikely that this encoding is the actually expected.

50

encoding, as it is unlikely that this encoding is the actually expected.

51

'''

51

'''

52

try:

52

try:

53

locale.CODESET

53

locale.CODESET

54

except AttributeError:

54

except AttributeError:

55

# Fall back to parsing environment variables :-(

55

# Fall back to parsing environment variables :-(

56

return locale.getdefaultlocale()[1]

56

return locale.getdefaultlocale()[1]

57

58

oldloc = locale.setlocale(locale.LC_CTYPE)

58

oldloc = locale.setlocale(locale.LC_CTYPE)

59

locale.setlocale(locale.LC_CTYPE, "")

59

locale.setlocale(locale.LC_CTYPE, "")

60

result = locale.nl_langinfo(locale.CODESET)

60

result = locale.nl_langinfo(locale.CODESET)

61

locale.setlocale(locale.LC_CTYPE, oldloc)

61

locale.setlocale(locale.LC_CTYPE, oldloc)

62

63

return result

63

return result

64

65

# Maps an encoding name as reported by the platform to a callable that
# returns the name to use in its place.
_encodingfixers = {}
# two spellings of plain ASCII
_encodingfixers['646'] = _encodingfixers['ANSI_X3.4-1968'] = lambda: 'ascii'
# mac-roman is re-queried through the locale-aware helper
_encodingfixers['mac-roman'] = _getpreferredencoding

70

71

# Determine the local character encoding once, at import time.
try:
    # an explicit HGENCODING takes precedence over the locale
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        # pass the detected name through its fixer, if one is registered;
        # names without a fixer are kept unchanged
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    # the locale could not be queried; settle for ASCII
    encoding = 'ascii'
# error handling scheme handed to decode() when converting local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding tolocal() tries for data that is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

80

81

class localstr(str):
    '''A local-encoding string that carries its UTF-8 original along.

    The str value is the local representation; the UTF-8 form is kept in
    the _utf8 attribute so that an unmodified string can be round-tripped
    to the local encoding and back.'''
    def __new__(cls, u, l):
        # u: UTF-8 form to remember, l: local form that becomes the value
        inst = str.__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # hashing the UTF-8 form avoids collisions in local string space
        return hash(self._utf8)

90

91

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or possibly other character
    sets. Decoding is therefore attempted strictly as UTF-8 first, then
    as Latin-1, and as a last resort as UTF-8 with unknown characters
    replaced.

    Whenever the local form cannot represent the string exactly, a
    localstr is returned: it keeps the known UTF-8 encoding next to the
    local representation so that fromlocal() can restore it losslessly.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # a strict decode proves that s really is stored as UTF-8
            ustr = s.decode('UTF-8')
            if encoding == 'UTF-8':
                return s # fast path: nothing to convert
            local = ustr.encode(encoding, "replace")
            if ustr != local.decode(encoding):
                # something was replaced: remember the original bytes
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # only expected when looking at an ancient changeset
            try:
                ustr = s.decode(fallbackencoding)
                local = ustr.encode(encoding, "replace")
                if ustr != local.decode(encoding):
                    return localstr(ustr.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch; the result can't be round-tripped
                ustr = s.decode("utf-8", "replace")
                return ustr.encode(encoding, "replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

151

152

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the mode selected through HGENCODINGMODE, 'strict'
    unless overridden. In strict mode an undecodable character aborts
    with an error message. Other modes include 'replace', which
    substitutes a special Unicode character for each unknown character,
    and 'ignore', which drops it.

    A localstr is not decoded at all: its cached UTF-8 form is returned.
    """

    # lossless round-trip available?
    if isinstance(s, localstr):
        return s._utf8

    try:
        decoded = s.decode(encoding, encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes on either side of the offending position
        begin = max(0, inst.start - 10)
        context = s[begin:inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (context, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

174

175

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

175

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.

176

# East Asian Width categories that ucolwidth() counts as two columns:
# always W and F, plus A (ambiguous) when HGENCODINGAMBIGUOUS is "wide".
wide = ("WFA" if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")

178

179

def colwidth(s):
    "Return the display width, in columns, of local-encoding string s"
    # undecodable bytes turn into replacement characters rather than
    # raising, so a width is always produced
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

182

183

def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Each character whose East Asian Width category is listed in the
    module-level 'wide' setting counts as two columns, every other
    character as one. If unicodedata offers no east_asian_width, the
    number of characters is returned instead.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in wide else 1 for c in d)

189

190

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidates s[start:x] are tried with x growing from start + c up to
    and including len(s); the first one that is exactly c columns wide
    is returned. Returns None if no candidate matches.
    '''
    # the upper bound is len(s) + 1 so that the candidate reaching the
    # very end of s is tested too
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    return None

197

198

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        u = None

    if u is None:
        # s is not valid in the local encoding: measure and cut in bytes
        if len(s) <= width:
            return s # already fits
        room = width - len(ellipsis)
        if room <= 0:
            # not enough room even for the ellipsis itself
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        return s # already fits

    room = width - len(ellipsis)
    if room <= 0:
        # not enough room even for the ellipsis itself
        return ellipsis[:width]

    # drop one more character per round from the trimmed side until the
    # remainder fits into the columns left over next to the ellipsis
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(encoding)
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    # not enough room for multi-column characters
    return ellipsis

290

291

def _asciilower(s):

291

def _asciilower(s):

292

'''convert a string to lowercase if ASCII

292

'''convert a string to lowercase if ASCII

293

294

Raises UnicodeDecodeError if non-ASCII characters are found.'''

294

Raises UnicodeDecodeError if non-ASCII characters are found.'''

295

s.decode('ascii')

295

s.decode('ascii')

296

return s.lower()

296

return s.lower()

297

298

def asciilower(s):
    '''lowercase the ASCII string s using the best available implementation

    The first call picks parsers.asciilower when the parsers module has
    one and _asciilower otherwise, rebinds the module-level name
    asciilower to that choice, and then delegates to it.'''
    global asciilower
    # importing late avoids a cyclic dependency around "parsers" in the
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    try:
        chosen = parsers.asciilower
    except AttributeError:
        chosen = _asciilower
    asciilower = chosen
    return chosen(s)

306

307

def _asciiupper(s):

307

def _asciiupper(s):

308

'''convert a string to uppercase if ASCII

308

'''convert a string to uppercase if ASCII

309

310

Raises UnicodeDecodeError if non-ASCII characters are found.'''

310

Raises UnicodeDecodeError if non-ASCII characters are found.'''

311

s.decode('ascii')

311

s.decode('ascii')

312

return s.upper()

312

return s.upper()

313

314

def asciiupper(s):
    '''uppercase the ASCII string s using the best available implementation

    The first call picks parsers.asciiupper when the parsers module has
    one and _asciiupper otherwise, rebinds the module-level name
    asciiupper to that choice, and then delegates to it.'''
    global asciiupper
    # importing late avoids a cyclic dependency around "parsers" in the
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    try:
        chosen = parsers.asciiupper
    except AttributeError:
        chosen = _asciiupper
    asciiupper = chosen
    return chosen(s)

322

323

def lower(s):
    "best-effort encoding-aware lowercasing of local string s"
    try:
        # pure ASCII input is handled without any decoding
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # prefer the exact UTF-8 form over the possibly lossy local one
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(encoding, encodingmode)

        folded = ustr.lower()
        if folded != ustr:
            return folded.encode(encoding)
        # nothing changed: hand back s itself to preserve localstring
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

343

344

def upper(s):
    "best-effort encoding-aware uppercasing of local string s"
    try:
        # pure ASCII input is handled without any decoding
        result = asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII input takes the encoding-aware route
        result = upperfallback(s)
    return result

350

351

def upperfallback(s):
    "encoding-aware uppercasing of local string s, used for non-ASCII input"
    try:
        if isinstance(s, localstr):
            # prefer the exact UTF-8 form over the possibly lossy local one
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(encoding, encodingmode)

        folded = ustr.upper()
        if folded != ustr:
            return folded.encode(encoding)
        # nothing changed: hand back s itself to preserve localstring
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

366

367

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    Each platform specifies one of these values, and the value should be
    consistent with what normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # ASCII strings are lowercased
    lower = -1
    # the fallback function is always called
    other = 0
    # ASCII strings are uppercased
    upper = 1

381

382

_jsonmap = []

382

_jsonmap = []

383

_jsonmap.extend("\\u%04x" % x for x in xrange(32))

383

_jsonmap.extend("\\u%04x" % x for x in xrange(32))

384

_jsonmap.extend(chr(x) for x in xrange(32, 127))

384

_jsonmap.extend(chr(x) for x in xrange(32, 127))

385

_jsonmap.append('\\u007f')

385

_jsonmap.append('\\u007f')

386

_jsonmap[0x09] = '\\t'

386

_jsonmap[0x09] = '\\t'

387

_jsonmap[0x0a] = '\\n'

387

_jsonmap[0x0a] = '\\n'

388

_jsonmap[0x22] = '\\"'

388

_jsonmap[0x22] = '\\"'

389

_jsonmap[0x5c] = '\\\\'

389

_jsonmap[0x5c] = '\\\\'

390

_jsonmap[0x08] = '\\b'

390

_jsonmap[0x08] = '\\b'

391

_jsonmap[0x0c] = '\\f'

391

_jsonmap[0x0c] = '\\f'

392

_jsonmap[0x0d] = '\\r'

392

_jsonmap[0x0d] = '\\r'

393

_paranoidjsonmap = _jsonmap[:]

393

_paranoidjsonmap = _jsonmap[:]

394

_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")

395

_paranoidjsonmap[0x3e] = '\\u003e' # '>'

394

_jsonmap.extend(chr(x) for x in xrange(128, 256))

396

_jsonmap.extend(chr(x) for x in xrange(128, 256))

395

397

396

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    table = _paranoidjsonmap if paranoid else _jsonmap

    utf8 = toutf8b(s)
    try:
        # fast path: translate byte by byte; a byte that the chosen
        # table has no entry for raises IndexError
        return ''.join(table[byte] for byte in bytearray(utf8))
    except IndexError:
        pass
    # slow path: go through UTF-16 code units, so that a non-BMP char
    # is represented as a surrogate pair
    units = array.array('H', utf8.decode('utf-8').encode('utf-16'))
    del units[0] # drop BOM
    pieces = []
    for unit in units:
        if unit < 128:
            pieces.append(table[unit])
        else:
            pieces.append('\\u%04x' % unit)
    return ''.join(pieces)

449

453

450

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

454

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

451

455

452

def getutf8char(s, pos):
    '''return the complete utf-8 character of s that begins at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos]
    # the first nibble tells how many bytes to attempt decoding
    nbytes = _utf8len[ord(lead) >> 4]
    if nbytes == 0: # ascii
        return lead

    char = s[pos:pos + nbytes]
    # decoding is done for validation only; the result is discarded
    char.decode("utf-8")
    return char

468

472

469

def toutf8b(s):

473

def toutf8b(s):

470

'''convert a local, possibly-binary string into UTF-8b

474

'''convert a local, possibly-binary string into UTF-8b

471

475

472

This is intended as a generic method to preserve data when working

476

This is intended as a generic method to preserve data when working

473

with schemes like JSON and XML that have no provision for

477

with schemes like JSON and XML that have no provision for

474

arbitrary byte strings. As Mercurial often doesn't know

478

arbitrary byte strings. As Mercurial often doesn't know

475

what encoding data is in, we use so-called UTF-8b.

479

what encoding data is in, we use so-called UTF-8b.

476

480

477

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

481

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

478

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

482

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

479

uDC00-uDCFF.

483

uDC00-uDCFF.

480

484

481

Principles of operation:

485

Principles of operation:

482

486

483

- ASCII and UTF-8 data successfully round-trips and is understood

487

- ASCII and UTF-8 data successfully round-trips and is understood

484

by Unicode-oriented clients

488

by Unicode-oriented clients

485

- filenames and file contents in arbitrary other encodings can have

489

- filenames and file contents in arbitrary other encodings can have

486

be round-tripped or recovered by clueful clients

490

be round-tripped or recovered by clueful clients

487

- local strings that have a cached known UTF-8 encoding (aka

491

- local strings that have a cached known UTF-8 encoding (aka

488

localstr) get sent as UTF-8 so Unicode-oriented clients get the

492

localstr) get sent as UTF-8 so Unicode-oriented clients get the

489

Unicode data they want

493

Unicode data they want

490

- because we must preserve UTF-8 bytestring in places such as

494

- because we must preserve UTF-8 bytestring in places such as

491

filenames, metadata can't be roundtripped without help

495

filenames, metadata can't be roundtripped without help

492

496

493

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

497

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

494

arbitrary bytes into an internal Unicode format that can be

498

arbitrary bytes into an internal Unicode format that can be

495

re-encoded back into the original. Here we are exposing the

499

re-encoded back into the original. Here we are exposing the

496

internal surrogate encoding as a UTF-8 string.)

500

internal surrogate encoding as a UTF-8 string.)

497

'''

501

'''

498

502

499

if "\xed" not in s:

503

if "\xed" not in s:

500

if isinstance(s, localstr):

504

if isinstance(s, localstr):

501

return s._utf8

505

return s._utf8

502

try:

506

try:

503

s.decode('utf-8')

507

s.decode('utf-8')

504

return s

508

return s

505

except UnicodeDecodeError:

509

except UnicodeDecodeError:

506

pass

510

pass

507

511

508

r = ""

512

r = ""

509

pos = 0

513

pos = 0

510

l = len(s)

514

l = len(s)

511

while pos < l:

515

while pos < l:

512

try:

516

try:

513

c = getutf8char(s, pos)

517

c = getutf8char(s, pos)

514

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

518

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

515

# have to re-escape existing U+DCxx characters

519

# have to re-escape existing U+DCxx characters

516

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

520

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

517

pos += 1

521

pos += 1

518

else:

522

else:

519

pos += len(c)

523

pos += len(c)

520

except UnicodeDecodeError:

524

except UnicodeDecodeError:

521

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

525

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

522

pos += 1

526

pos += 1

523

r += c

527

r += c

524

return r

528

return r

525

529

526

def fromutf8b(s):

530

def fromutf8b(s):

527

'''Given a UTF-8b string, return a local, possibly-binary string.

531

'''Given a UTF-8b string, return a local, possibly-binary string.

528

532

529

return the original binary string. This

533

return the original binary string. This

530

is a round-trip process for strings like filenames, but metadata

534

is a round-trip process for strings like filenames, but metadata

531

that's was passed through tolocal will remain in UTF-8.

535

that's was passed through tolocal will remain in UTF-8.

532

536

533

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

537

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

534

>>> m = "\\xc3\\xa9\\x99abcd"

538

>>> m = "\\xc3\\xa9\\x99abcd"

535

>>> toutf8b(m)

539

>>> toutf8b(m)

536

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

540

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

537

>>> roundtrip(m)

541

>>> roundtrip(m)

538

True

542

True

539

>>> roundtrip("\\xc2\\xc2\\x80")

543

>>> roundtrip("\\xc2\\xc2\\x80")

540

True

544

True

541

>>> roundtrip("\\xef\\xbf\\xbd")

545

>>> roundtrip("\\xef\\xbf\\xbd")

542

True

546

True

543

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

547

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

544

True

548

True

545

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

549

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

546

True

550

True

547

'''

551

'''

548

552

549

# fast path - look for uDxxx prefixes in s

553

# fast path - look for uDxxx prefixes in s

550

if "\xed" not in s:

554

if "\xed" not in s:

551

return s

555

return s

552

556

553

# We could do this with the unicode type but some Python builds

557

# We could do this with the unicode type but some Python builds

554

# use UTF-16 internally (issue5031) which causes non-BMP code

558

# use UTF-16 internally (issue5031) which causes non-BMP code

555

# points to be escaped. Instead, we use our handy getutf8char

559

# points to be escaped. Instead, we use our handy getutf8char

556

# helper again to walk the string without "decoding" it.

560

# helper again to walk the string without "decoding" it.

557

561

558

r = ""

562

r = ""

559

pos = 0

563

pos = 0

560

l = len(s)

564

l = len(s)

561

while pos < l:

565

while pos < l:

562

c = getutf8char(s, pos)

566

c = getutf8char(s, pos)

563

pos += len(c)

567

pos += len(c)

564

# unescape U+DCxx characters

568

# unescape U+DCxx characters

565

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

569

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

566

c = chr(ord(c.decode("utf-8")) & 0xff)

570

c = chr(ord(c.decode("utf-8")) & 0xff)

567

r += c

571

r += c

568

return r

572

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
             )
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry is the UTF-8 encoding of one of the listed code points.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only scans
             # strings that contain one of these two lead bytes, so every entry
             # of _ignore has to start with "\xe2" or "\xef"
             assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip the codepoints that HFS+ ignores out of s.

                 s is a UTF-8 encoded byte string; the cleaned string is returned.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every ignorable sequence starts with \xe2 or \xef, so a string
                 # holding neither lead byte cannot contain one
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 cleaned = s
                 for seq in _ignore:
                     cleaned = cleaned.replace(seq, '')
                 return cleaned
             def _getpreferredencoding():
                 '''
                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.
                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the actually expected.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 result = locale.nl_langinfo(locale.CODESET)
                 locale.setlocale(locale.LC_CTYPE, oldloc)
                 return result
             # Maps an encoding name reported by the locale module to a callable
             # that returns the encoding name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # Pick the local encoding once, at import time: a non-empty
             # HGENCODING environment variable wins, otherwise ask the locale
             # module and run the answer through _encodingfixers.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 # the locale could not be queried; settle for plain ASCII
                 encoding = 'ascii'
             # error-handling mode handed to str.decode() by fromlocal(), lower()
             # and upperfallback()
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # encoding tolocal() tries when a stored string is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 Keeping the UTF-8 form alongside the local representation lets
                 strings that are unmodified be round-tripped to the local encoding
                 and back.'''
                 def __new__(cls, u, l):
                     # the str value is the local form 'l'; the UTF-8 form 'u'
                     # rides along as an attribute
                     obj = str.__new__(cls, l)
                     obj._utf8 = u
                     return obj
                 def __hash__(self):
                     # hash on the UTF-8 form to avoid collisions in local
                     # string space
                     utf8 = self._utf8
                     return hash(utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Raises error.Abort if the configured encoding is unknown.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # confirm that the string really is stored as UTF-8
                         decoded = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path: local encoding is UTF-8 as well
                             return s
                         localized = decoded.encode(encoding, "replace")
                         if decoded != localized.decode(encoding):
                             # lossy conversion: keep the UTF-8 original attached
                             return localstr(s, localized)
                         # safe, non-lossy encoding of s
                         return localized
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient
                         # changeset
                         try:
                             decoded = s.decode(fallbackencoding)
                             localized = decoded.encode(encoding, "replace")
                             if decoded != localized.decode(encoding):
                                 return localstr(decoded.encode('UTF-8'), localized)
                             # safe, non-lossy encoding of s
                             return localized
                         except UnicodeDecodeError:
                             # last ditch; the result can't round-trip
                             decoded = s.decode("utf-8", "replace")
                             return decoded.encode(encoding, "replace")
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.

                 Raises error.Abort when s cannot be decoded or when the configured
                 encoding is unknown.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip: the UTF-8 original is cached on s
                     return s._utf8
                 try:
                     decoded = s.decode(encoding, encodingmode)
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # show up to ten bytes on either side of the offending position
                     lo = max(0, err.start - 10)
                     hi = err.start + 10
                     raise error.Abort("decoding near '%s': %s!" % (s[lo:hi], err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             # East Asian width classes that ucolwidth() counts as two columns.
             # Ambiguous-width ("A") characters are only included when
             # HGENCODINGAMBIGUOUS is set to 'wide'.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Return the display width, in columns, of local-encoding string s"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display width, in columns, of Unicode string d"
                 width_of = getattr(unicodedata, 'east_asian_width', None)
                 if width_of is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for ch in d:
                     # characters whose width class is listed in 'wide' take two
                     # columns, everything else takes one
                     if width_of(ch) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start

                 Returns None when no candidate slice is exactly c columns wide.
                 Candidate end offsets run from start + c up to, but not including,
                 len(s).'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
                     end += 1
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 Strings that cannot be decoded with the local encoding are trimmed
                 byte by byte instead of column by column.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(encoding)
                 except UnicodeDecodeError:
                     u = None
                 if u is None:
                     # undecodable input: fall back to counting bytes
                     if len(s) <= width: # trimming is not needed
                         return s
                     room = width - len(ellipsis)
                     if room <= 0: # no enough room even for ellipsis
                         return ellipsis[:width]
                     if leftside:
                         return ellipsis + s[-room:]
                     return s[:room] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # columns left for text once the ellipsis has been placed
                 room = width - len(ellipsis)
                 if room <= 0: # no enough room even for ellipsis
                     return ellipsis[:width]
                 # drop one more character from the trimmed side on each pass until
                 # what is left fits into the remaining columns
                 for dropped in xrange(1, len(u)):
                     if leftside:
                         kept = u[dropped:]
                     else:
                         kept = u[:-dropped]
                     if ucolwidth(kept) <= room:
                         encoded = kept.encode(encoding)
                         if leftside:
                             return ellipsis + encoded
                         return encoded + ellipsis
                 return ellipsis # no enough room for multi-column characters
             def _asciilower(s):
                 '''convert a string to lowercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.lower()
             def asciilower(s):
                 '''Lowercase an ASCII string, using parsers.asciilower if it exists.

                 The first call rebinds the module-level name asciilower to the
                 chosen implementation (parsers.asciilower, or _asciilower when the
                 parsers module has no such attribute).'''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciilower', _asciilower)
                 asciilower = chosen
                 return chosen(s)
             def _asciiupper(s):
                 '''convert a string to uppercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.upper()
             def asciiupper(s):
                 '''Uppercase an ASCII string, using parsers.asciiupper if it exists.

                 The first call rebinds the module-level name asciiupper to the
                 chosen implementation (parsers.asciiupper, or _asciiupper when the
                 parsers module has no such attribute).'''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciiupper', _asciiupper)
                 asciiupper = chosen
                 return chosen(s)
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: take the encoding-aware route below
                     pass
                 try:
                     if isinstance(s, localstr):
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(encoding, encodingmode)
                     folded = decoded.lower()
                     if folded != decoded:
                         return folded.encode(encoding)
                     # nothing changed: hand back s itself to preserve localstring
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.lower()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: defer to the encoding-aware implementation
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 "best-effort encoding-aware upper-casing of local string s"
                 try:
                     if isinstance(s, localstr):
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(encoding, encodingmode)
                     raised = decoded.upper()
                     if raised != decoded:
                         return raised.encode(encoding)
                     # nothing changed: hand back s itself to preserve localstring
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.upper()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''Describes what a platform's normcase does to ASCII strings.

                 The value is specified per platform, and should be consistent with
                 what normcase on that platform actually does.

                 This should be kept in sync with normcase_spec in util.h.'''
                 # normcase uppercases ASCII strings
                 upper = 1
                 # the fallback function should always be called
                 other = 0
                 # normcase lowercases ASCII strings
                 lower = -1
             # Lookup tables for jsonescape(): the entry at index b is the JSON
             # text emitted for byte value b.
             _jsonmap = []
             # control characters 0x00-0x1f default to \uXXXX escapes
             _jsonmap.extend("\\u%04x" % x for x in xrange(32))
             # 0x20-0x7e map to themselves unless overridden below
             _jsonmap.extend(chr(x) for x in xrange(32, 127))
             _jsonmap.append('\\u007f')
             # short escapes take precedence over the generic entries above
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             # The paranoid table is copied while it still has only 128 entries,
             # so looking up a byte >= 0x80 in it raises IndexError; jsonescape()
             # catches that to leave its fast path.
             _paranoidjsonmap = _jsonmap[:]
+            _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
+            _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # the regular table passes bytes 0x80-0xff through unchanged
             _jsonmap.extend(chr(x) for x in xrange(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also
                 escaped. This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 table = _paranoidjsonmap if paranoid else _jsonmap
                 encoded = toutf8b(s)
                 try:
                     # fast path: every byte has an entry in the table
                     return ''.join(table[b] for b in bytearray(encoded))
                 except IndexError:
                     # a byte fell outside the table; escape by code unit instead
                     pass
                 # non-BMP char is represented as UTF-16 surrogate pair
                 utf16 = encoded.decode('utf-8').encode('utf-16')
                 units = array.array('H', utf16)
                 units.pop(0)  # drop BOM
                 return ''.join(table[unit] if unit < 128 else '\\u%04x' % unit
                                for unit in units)
             # Number of bytes getutf8char() slices off, indexed by the high nibble
             # of the first byte.  0 (nibbles 0x0-0x7) marks plain ASCII.  Nibbles
             # 0x8-0xb are continuation bytes; they get length 1 so the validating
             # decode in getutf8char() rejects them.  0xc-0xd start two-byte
             # sequences, 0xe three-byte and 0xf four-byte sequences.
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''Return the complete UTF-8 character of s that begins at pos.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos]
                 # the high nibble of the first byte tells how many bytes to try
                 size = _utf8len[ord(lead) >> 4]
                 if size == 0: # ascii
                     return lead
                 candidate = s[pos:pos + size]
                 # the decode is done purely for validation; its result is unused
                 candidate.decode("utf-8")
                 return candidate
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if "\xed" not in s:
                     # no \xed byte means no pre-existing U+DCxx escapes to worry
                     # about, so the whole-string shortcuts are safe
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
                         s.decode('utf-8')
                     except UnicodeDecodeError:
                         pass
                     else:
                         return s
                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         # existing U+DCxx characters have to be re-escaped
                         escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
                     except UnicodeDecodeError:
                         # not the start of a valid character
                         escape = True
                     if escape:
                         # map this single byte into U+DC00-U+DCFF and move on by
                         # one byte only
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     else:
                         pos += len(c)
                     pieces.append(c)
                 return "".join(pieces)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 return the original binary string. This
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         # unescape U+DCxx characters: the low eight bits of the
                         # code point are the original byte
                         c = chr(ord(c.decode("utf-8")) & 0xff)
                     pieces.append(c)
                 return "".join(pieces)