##// END OF EJS Templates
encoding: use range() instead of xrange()...
Gregory Szorc -
r28508:3c6e94d0 default
parent child Browse files
Show More
@@ -1,579 +1,579
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import
8 from __future__ import absolute_import
9
9
10 import array
10 import array
11 import locale
11 import locale
12 import os
12 import os
13 import sys
13 import sys
14 import unicodedata
14 import unicodedata
15
15
16 from . import (
16 from . import (
17 error,
17 error,
18 )
18 )
19
19
# Python 3 has no unichr(); chr() covers the full codepoint range there.
if sys.version_info[0] >= 3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
if sys.version_info[0] >= 3:
    # indexing a bytes object yields ints on Python 3
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
34
34
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence in _ignore is UTF-8 starting with \xe2 or
    # \xef, so a cheap containment check lets most strings pass untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignorable in _ignore:
        s = s.replace(ignorable, '')
    return s
47
47
def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the actually expected.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's locale to query its
    # codeset, then restore the previous setting
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    result = locale.nl_langinfo(locale.CODESET)
    locale.setlocale(locale.LC_CTYPE, oldloc)

    return result
71
71
# Replacements for encoding names the platform reports in unhelpful
# forms: '646' and 'ANSI_X3.4-1968' are C-library aliases for ASCII;
# mac-roman is corrected via _getpreferredencoding() (darwin bug).
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

try:
    # HGENCODING overrides the locale-derived default
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# codec error handler used when decoding: 'strict', 'replace' or 'ignore'
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# legacy repos may contain latin-1 metadata; see tolocal()
fallbackencoding = 'ISO-8859-1'
87
87
class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        # u: the known UTF-8 form, l: the local-encoding form; the str
        # value is the local form, the UTF-8 form rides along as _utf8
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space
97
97
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(encoding, "replace")
            if u == r.decode(encoding):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the original UTF-8 cached on a localstr
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(fallbackencoding)
                r = u.encode(encoding, "replace")
                if u == r.decode(encoding):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                return u.encode(encoding, "replace") # can't round-trip
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
158
158
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as inst:
        # include a little context around the offending byte in the error
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
181
181
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() categories that ucolwidth()
# counts as two columns: W(ide), F(ullwidth), and optionally A(mbiguous).
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        and "WFA" or "WF")
185
185
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count one column each
    return ucolwidth(s.decode(encoding, 'replace'))
189
189
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; assume one column per char
        return len(d)
    # characters in a 'wide' category occupy two columns, all others one
    return sum(2 if eaw(ch) in wide else 1 for ch in d)
196
196
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None when no prefix of s[start:] occupies exactly c columns.
    '''
    # range() instead of the Python-2-only xrange(), matching the rest of
    # this module's range() migration; the list built on py2 is small
    for x in range(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
204
204
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side one at a time until the
    # remainder fits in the available columns
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    # range() instead of the Python-2-only xrange(), matching the rest of
    # this module's range() migration
    for i in range(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # no enough room for multi-column characters
297
297
def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode serves only as a validity check; its result is discarded
    s.decode('ascii')
    lowered = s.lower()
    return lowered
304
304
def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation from parsers when available
    impl = getattr(parsers, 'asciilower', _asciilower)
    # rebind the module-level name so subsequent calls skip this lookup
    global asciilower
    asciilower = impl
    return impl(s)
313
313
def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode serves only as a validity check; its result is discarded
    s.decode('ascii')
    uppered = s.upper()
    return uppered
320
320
def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    # prefer the C implementation from parsers when available
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    # rebind the module-level name so subsequent calls skip this lookup
    global asciiupper
    asciiupper = impl
    return impl(s)
329
329
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(encoding)
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
350
350
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast path: pure-ASCII strings fold without decoding
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
357
357
def upperfallback(s):
    # encoding-aware uppercasing for non-ASCII input; mirrors lower()
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(encoding, encodingmode)

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(encoding)
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
373
373
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # integer values must match normcase_spec in util.h
    lower = -1
    upper = 1
    other = 0
388
388
# Table mapping each byte value to its JSON string form, indexed by byte.
_jsonmap = []
_jsonmap.extend("\\u%04x" % b for b in range(0x20))      # C0 controls
_jsonmap.extend(chr(b) for b in range(0x20, 0x7f))       # printable ASCII
_jsonmap.append('\\u007f')                               # DEL
# shorthand escapes override the generic \uXXXX forms above
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# the paranoid variant (128 entries only) additionally escapes characters
# that are troublesome in web output
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
# high bytes pass through unescaped in the non-paranoid table
_jsonmap.extend(chr(b) for b in range(0x80, 0x100))
404
404
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        # bytes >= 128 with the (128-entry) paranoid map land here and
        # fall through to the \uXXXX-escaping path below
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
460
460
# utf-8 sequence length indexed by the high nibble of the lead byte:
# nibbles 0-7 are single-byte ASCII (0), 0x8-0xb are continuation bytes
# (the 1-byte "sequence" fails the validating decode in getutf8char),
# 0xc-0xd start 2-byte, 0xe 3-byte and 0xf 4-byte sequences
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos]) >> 4]
    if not l: # ascii
        return s[pos]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c
479
479
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if "\xed" not in s:
        # fast path: no surrogate-range lead byte present, so a string
        # that is already valid UTF-8 can be returned unmodified
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # collect chunks and join once at the end; repeated string
    # concatenation is worst-case quadratic on long inputs
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        chunks.append(c)
    return "".join(chunks)
536
536
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # collect chunks and join once at the end; repeated string
    # concatenation is worst-case quadratic on long inputs
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original byte
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        chunks.append(c)
    return "".join(chunks)
General Comments 0
You need to be logged in to leave comments. Login now