upstream/mercurial-mirror Commit - r33756:f5fc54e7

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

# module providing the ASCII case helpers, resolved through policy.importmod
charencode = policy.importmod(r'charencode')

# re-export the ASCII-only case converters under their historical names
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper

# short alias used throughout this module for codec/error-handler names
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is bound to that name instead
    unichr = chr

25

30

26

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
#
# Each codepoint below is written in hex and stored as its UTF-8 encoding.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# verify the next function will work: hfsignoreclean() only scans strings
# containing one of these two lead bytes
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)

34

39

35

def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    Every entry of _ignore is deleted from s. Strings that contain neither
    of the two lead bytes shared by those entries are returned untouched
    without running any replacement.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: all ignored sequences start with one of these bytes
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s

47

52

48

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
#
# _nativeenviron is true when a bytes view of the environment can be used
# directly (always on Python 2; on Python 3 only if the os module says so).
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if pycompat.ispy3:
    if _nativeenviron:
        environ = os.environb # re-exports
    else:
        # preferred encoding isn't known yet; use utf-8 to avoid unicode error
        # and recreate it once encoding is settled
        environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                       for k, v in os.environ.items()) # re-exports
else:
    environ = os.environ # re-exports

60

65

61

# maps encoding names reported by the locale module to a callable returning
# the name that should be used instead
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

# Local encoding: HGENCODING wins; otherwise ask the locale module and run
# the answer through _encodingfixers. A locale failure falls back to ascii.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        # the default lambda is called before the assignment happens, so it
        # yields the value computed on the previous line
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'

# error-handler name handed to decode() by fromlocal()/lower()/upperfallback()
encodingmode = environ.get("HGENCODINGMODE", "strict")

# second decoding attempt made by tolocal() when a string is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

75

80

76

class localstr(str):
    '''A local-encoding string that remembers its UTF-8 original

    The instance compares like the local string `l`, while the UTF-8 form
    `u` is kept in the `_utf8` attribute so an unmodified string can be
    round-tripped to the local encoding and back.'''

    def __new__(cls, u, l):
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

85

90

86

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then the fallback encoding, and failing that, we use
    UTF-8 and replace unknown characters.

    When the local form cannot represent the string exactly, a localstr
    is returned: it caches the known UTF-8 encoding next to the local
    representation to allow lossless round-trip conversion back to UTF-8.

    An unknown codec name is reported as error.Abort.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            decoded = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            localform = decoded.encode(_sysstr(encoding), u"replace")
            if decoded == localform.decode(_sysstr(encoding)):
                # localform is a safe, non-lossy encoding of s
                return localform
            return localstr(s, localform)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                decoded = s.decode(_sysstr(fallbackencoding))
                localform = decoded.encode(_sysstr(encoding), u"replace")
                if decoded == localform.decode(_sysstr(encoding)):
                    # localform is a safe, non-lossy encoding of s
                    return localform
                return localstr(decoded.encode('UTF-8'), localform)
            except UnicodeDecodeError:
                # last ditch; the result can't round-trip
                decoded = s.decode("utf-8", "replace")
                return decoded.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

147

152

148

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr is converted losslessly by returning its cached UTF-8 form.

    Anything else is decoded using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Decoding failures and unknown codec names are raised as error.Abort.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(_sysstr(encoding), _sysstr(encodingmode)).encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes on either side of the offending position
        begin = max(0, inst.start - 10)
        end = inst.start + 10
        sub = s[begin:end]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

171

176

172

def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding

    The string is first encoded as UTF-8 and then handed to tolocal()."""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

175

180

176

def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string

    The string goes through fromlocal() and the UTF-8 result is decoded."""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

179

184

180

def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    The returned function calls bytesfunc on its argument and converts the
    local-encoding result to unicode with unifromlocal()."""
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc

186

191

187

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if not pycompat.ispy3:
    # native str is already a byte string: nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
else:
    # native str is unicode: go through the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod

198

203

199

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules: keys and values are encoded as
    # UTF-8 and then converted with tolocal()
    environ = dict(
        (tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
        for k, v in os.environ.items()) # re-exports

204

209

205

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
#
# _wide holds the east-asian-width codes that ucolwidth() counts as two
# columns; "A" is included only when HGENCODINGAMBIGUOUS is "wide".
_wide = _sysstr(
    "WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
    else "WF")

208

213

209

def colwidth(s):
    """Find the column width of a string for display in the local encoding

    s is decoded with the local encoding (undecodable bytes are replaced)
    and measured with ucolwidth()."""
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)

212

217

213

def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose east-asian-width code is listed in _wide count as two
    columns, all others as one. If unicodedata offers no east_asian_width,
    the plain length of d is returned."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)

219

224

220

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate end offsets run from start + c up to, but not including,
    len(s); the first slice that is exactly c columns wide is returned.
    When no candidate matches, the function falls off the end and returns
    None.'''
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate

227

232

228

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Strings that cannot be decoded with the local encoding are trimmed
    byte-wise, counting one column per byte. When 'width' leaves no room
    beyond the ellipsis, only (a prefix of) the ellipsis is returned.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-wise trimming
        if len(s) <= width: # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0: # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0: # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on each round until the
    # remainder fits in the columns left over for it
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            keptbytes = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + keptbytes
            return keptbytes + ellipsis
    return ellipsis # no enough room for multi-column characters

320

325

321

def _asciilower(s):

322

'''convert a string to lowercase if ASCII

323

324

Raises UnicodeDecodeError if non-ASCII characters are found.'''

325

s.decode('ascii')

326

return s.lower()

327

328

def asciilower(s):
    '''Lowercase an ASCII string using the implementation chosen by policy

    On first call this looks up 'asciilower' in the policy-selected
    "parsers" module (falling back to _asciilower), rebinds the module-level
    name asciilower to that implementation and delegates to it.'''
    global asciilower
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    parsers = policy.importmod(r'parsers')
    chosen = getattr(parsers, 'asciilower', _asciilower)
    asciilower = chosen
    return chosen(s)

336

337

def _asciiupper(s):

338

'''convert a string to uppercase if ASCII

339

340

Raises UnicodeDecodeError if non-ASCII characters are found.'''

341

s.decode('ascii')

342

return s.upper()

343

344

def asciiupper(s):
    '''Uppercase an ASCII string using the implementation chosen by policy

    On first call this looks up 'asciiupper' in the policy-selected
    "parsers" module (falling back to _asciiupper), rebinds the module-level
    name asciiupper to that implementation and delegates to it.'''
    global asciiupper
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    parsers = policy.importmod(r'parsers')
    chosen = getattr(parsers, 'asciiupper', _asciiupper)
    asciiupper = chosen
    return chosen(s)

352

353

def lower(s):
    """best-effort encoding-aware case-folding of local string s

    Pure-ASCII input is handled by asciilower(). Otherwise s is decoded
    (from its cached UTF-8 form if it is a localstr, else from the local
    encoding), lowercased and re-encoded in the local encoding. If that
    fails with a UnicodeError, the plain s.lower() is returned. An unknown
    codec name is raised as error.Abort."""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if text == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

373

346

374

def upper(s):
    """best-effort encoding-aware case-folding of local string s

    asciiupper() is tried first; when it rejects s as non-ASCII, the work
    is handed to upperfallback()."""
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result

380

353

381

def upperfallback(s):
    """encoding-aware uppercasing of local string s

    s is decoded (from its cached UTF-8 form if it is a localstr, else from
    the local encoding), uppercased and re-encoded in the local encoding.
    If that fails with a UnicodeError, the plain s.upper() is returned. An
    unknown codec name is raised as error.Abort."""
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if text == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

396

369

397

class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # plain integer constants; see the docstring for their meaning
    lower = -1
    upper = 1
    other = 0

411

384

412

# Per-byte JSON escaping tables, indexed by byte value.
_jsonmap = []
# 0x00-0x1f: control characters become \uXXXX escapes by default
_jsonmap.extend("\\u%04x" % x for x in range(32))
# 0x20-0x7e: printable ASCII maps to itself
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
# 0x7f
_jsonmap.append('\\u007f')
# short escapes override the generic entries above
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# the paranoid table is copied while it only has 128 entries, so bytes
# >= 0x80 raise IndexError and send jsonescape() down its slow path
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
# 0x80-0xff: passed through unchanged in the non-paranoid table only
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))

427

400

428

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    jm = _paranoidjsonmap if paranoid else _jsonmap

    u8chars = toutf8b(s)
    try:
        # fast path: translate byte by byte; a byte missing from the table
        # raises IndexError and falls through to the code below
        return ''.join(jm[b] for b in bytearray(u8chars))
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16bytes = u8chars.decode('utf-8').encode('utf-16')
    u16codes = array.array('H', u16bytes)
    u16codes.pop(0) # drop BOM
    return ''.join(jm[code] if code < 128 else '\\u%04x' % code
                   for code in u16codes)

483

456

484

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

457

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

485

458

486

def getutf8char(s, pos):

459

def getutf8char(s, pos):

487

'''get the next full utf-8 character in the given string, starting at pos

460

'''get the next full utf-8 character in the given string, starting at pos

488

461

489

Raises a UnicodeError if the given location does not start a valid

462

Raises a UnicodeError if the given location does not start a valid

490

utf-8 character.

463

utf-8 character.

491

'''

464

'''

492

465

493

# find how many bytes to attempt decoding from first nibble

466

# find how many bytes to attempt decoding from first nibble

494

l = _utf8len[ord(s[pos]) >> 4]

467

l = _utf8len[ord(s[pos]) >> 4]

495

if not l: # ascii

468

if not l: # ascii

496

return s[pos]

469

return s[pos]

497

470

498

c = s[pos:pos + l]

471

c = s[pos:pos + l]

499

# validate with attempted decode

472

# validate with attempted decode

500

c.decode("utf-8")

473

c.decode("utf-8")

501

return c

474

return c

502

475

503

def toutf8b(s):

476

def toutf8b(s):

504

'''convert a local, possibly-binary string into UTF-8b

477

'''convert a local, possibly-binary string into UTF-8b

505

478

506

This is intended as a generic method to preserve data when working

479

This is intended as a generic method to preserve data when working

507

with schemes like JSON and XML that have no provision for

480

with schemes like JSON and XML that have no provision for

508

arbitrary byte strings. As Mercurial often doesn't know

481

arbitrary byte strings. As Mercurial often doesn't know

509

what encoding data is in, we use so-called UTF-8b.

482

what encoding data is in, we use so-called UTF-8b.

510

483

511

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

484

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

512

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

485

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

513

uDC00-uDCFF.

486

uDC00-uDCFF.

514

487

515

Principles of operation:

488

Principles of operation:

516

489

517

- ASCII and UTF-8 data successfully round-trips and is understood

490

- ASCII and UTF-8 data successfully round-trips and is understood

518

by Unicode-oriented clients

491

by Unicode-oriented clients

519

- filenames and file contents in arbitrary other encodings can have

492

- filenames and file contents in arbitrary other encodings can have

520

be round-tripped or recovered by clueful clients

493

be round-tripped or recovered by clueful clients

521

- local strings that have a cached known UTF-8 encoding (aka

494

- local strings that have a cached known UTF-8 encoding (aka

522

localstr) get sent as UTF-8 so Unicode-oriented clients get the

495

localstr) get sent as UTF-8 so Unicode-oriented clients get the

523

Unicode data they want

496

Unicode data they want

524

- because we must preserve UTF-8 bytestring in places such as

497

- because we must preserve UTF-8 bytestring in places such as

525

filenames, metadata can't be roundtripped without help

498

filenames, metadata can't be roundtripped without help

526

499

527

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

500

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

528

arbitrary bytes into an internal Unicode format that can be

501

arbitrary bytes into an internal Unicode format that can be

529

re-encoded back into the original. Here we are exposing the

502

re-encoded back into the original. Here we are exposing the

530

internal surrogate encoding as a UTF-8 string.)

503

internal surrogate encoding as a UTF-8 string.)

531

'''

504

'''

532

505

533

if "\xed" not in s:

506

if "\xed" not in s:

534

if isinstance(s, localstr):

507

if isinstance(s, localstr):

535

return s._utf8

508

return s._utf8

536

try:

509

try:

537

s.decode('utf-8')

510

s.decode('utf-8')

538

return s

511

return s

539

except UnicodeDecodeError:

512

except UnicodeDecodeError:

540

pass

513

pass

541

514

542

r = ""

515

r = ""

543

pos = 0

516

pos = 0

544

l = len(s)

517

l = len(s)

545

while pos < l:

518

while pos < l:

546

try:

519

try:

547

c = getutf8char(s, pos)

520

c = getutf8char(s, pos)

548

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

521

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

549

# have to re-escape existing U+DCxx characters

522

# have to re-escape existing U+DCxx characters

550

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

523

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

551

pos += 1

524

pos += 1

552

else:

525

else:

553

pos += len(c)

526

pos += len(c)

554

except UnicodeDecodeError:

527

except UnicodeDecodeError:

555

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

528

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

556

pos += 1

529

pos += 1

557

r += c

530

r += c

558

return r

531

return r

559

532

560

def fromutf8b(s):

533

def fromutf8b(s):

561

'''Given a UTF-8b string, return a local, possibly-binary string.

534

'''Given a UTF-8b string, return a local, possibly-binary string.

562

535

563

return the original binary string. This

536

return the original binary string. This

564

is a round-trip process for strings like filenames, but metadata

537

is a round-trip process for strings like filenames, but metadata

565

that's was passed through tolocal will remain in UTF-8.

538

that's was passed through tolocal will remain in UTF-8.

566

539

567

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

540

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

568

>>> m = "\\xc3\\xa9\\x99abcd"

541

>>> m = "\\xc3\\xa9\\x99abcd"

569

>>> toutf8b(m)

542

>>> toutf8b(m)

570

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

543

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

571

>>> roundtrip(m)

544

>>> roundtrip(m)

572

True

545

True

573

>>> roundtrip("\\xc2\\xc2\\x80")

546

>>> roundtrip("\\xc2\\xc2\\x80")

574

True

547

True

575

>>> roundtrip("\\xef\\xbf\\xbd")

548

>>> roundtrip("\\xef\\xbf\\xbd")

576

True

549

True

577

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

550

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

578

True

551

True

579

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

552

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

580

True

553

True

581

'''

554

'''

582

555

583

# fast path - look for uDxxx prefixes in s

556

# fast path - look for uDxxx prefixes in s

584

if "\xed" not in s:

557

if "\xed" not in s:

585

return s

558

return s

586

559

587

# We could do this with the unicode type but some Python builds

560

# We could do this with the unicode type but some Python builds

588

# use UTF-16 internally (issue5031) which causes non-BMP code

561

# use UTF-16 internally (issue5031) which causes non-BMP code

589

# points to be escaped. Instead, we use our handy getutf8char

562

# points to be escaped. Instead, we use our handy getutf8char

590

# helper again to walk the string without "decoding" it.

563

# helper again to walk the string without "decoding" it.

591

564

592

r = ""

565

r = ""

593

pos = 0

566

pos = 0

594

l = len(s)

567

l = len(s)

595

while pos < l:

568

while pos < l:

596

c = getutf8char(s, pos)

569

c = getutf8char(s, pos)

597

pos += len(c)

570

pos += len(c)

598

# unescape U+DCxx characters

571

# unescape U+DCxx characters

599

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

572

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

600

c = chr(ord(c.decode("utf-8")) & 0xff)

573

c = chr(ord(c.decode("utf-8")) & 0xff)

601

r += c

574

r += c

602

return r

575

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

@@ -0,0 +1,22 b''
	1	# charencode.py - miscellaneous character encoding
	2	#
	3	# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
	4	#
	5	# This software may be used and distributed according to the terms of the
	6	# GNU General Public License version 2 or any later version.
	7
	8	from __future__ import absolute_import
	9
	10	def asciilower(s):
	11	'''convert a string to lowercase if ASCII
	12
	13	Raises UnicodeDecodeError if non-ASCII characters are found.'''
	14	s.decode('ascii')
	15	return s.lower()
	16
	17	def asciiupper(s):
	18	'''convert a string to uppercase if ASCII
	19
	20	Raises UnicodeDecodeError if non-ASCII characters are found.'''
	21	s.decode('ascii')
	22	return s.upper()

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 policy,
                 pycompat,
             )
+            charencode = policy.importmod(r'charencode')
+            asciilower = charencode.asciilower
+            asciiupper = charencode.asciiupper
             # Local alias: encoding names are passed through this before they are handed
             # to str.decode() / unicode.encode() further down in this module.
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 # Python 3 has no unichr() builtin; chr() plays that role there.
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry of _ignore is the UTF-8 byte sequence of one such code point.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only scans strings
             # that contain a "\xe2" or "\xef" byte, so every ignored sequence has to
             # start with one of those two bytes
             assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 """Strip every code point that HFS+ ignores out of s.

                 s is a UTF-8 encoded string; the cleaned string is returned.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every sequence in _ignore starts with one of these two bytes, so a
                 # string containing neither of them has nothing to strip
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for seq in _ignore:
                     s = s.replace(seq, '')
                 return s
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # _nativeenviron is True when the environment can be taken as byte strings
             # directly: always on Python 2, and on Python 3 only when the os module
             # reports supports_bytes_environ.
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             # Encoding names that the lookup below may produce and that should be
             # treated as plain ASCII; each value is a callable returning the name to use.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
             }
             try:
                 # HGENCODING takes precedence; otherwise ask the locale module and fall
                 # back to 'ascii' when it reports an empty name
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     # names without a fixer are kept as they are
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handling mode handed to str.decode() by fromlocal(), lower() and
             # upperfallback()
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # tried by tolocal() when a stored string turns out not to be valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its original UTF-8 form.

                 The string value itself is the local representation; the UTF-8 source
                 is kept in the _utf8 attribute so that an unmodified string can be
                 round-tripped to the local encoding and back.'''
                 def __new__(cls, u, l):
                     # u: the UTF-8 form, l: the local form (becomes the str value)
                     inst = str.__new__(cls, l)
                     inst._utf8 = u
                     return inst
                 def __hash__(self):
                     # hash on the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 Internal strings are expected to be UTF-8, but repositories that
                 predate locale support may hold latin1 or other character sets.
                 Decoding is attempted strictly as UTF-8 first, then with the
                 fallback encoding (Latin-1), and as a last resort as UTF-8 with
                 unknown characters replaced.

                 Whenever the local form cannot represent the text losslessly, a
                 localstr is returned: it carries the known UTF-8 form next to the
                 local representation so fromlocal() can restore it exactly.

                 Raises error.Abort if one of the encodings involved is unknown.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # a strict decode doubles as the check that s really is UTF-8
                         uni = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path: local encoding and storage encoding coincide
                             return s
                         local = uni.encode(_sysstr(encoding), u"replace")
                         if local.decode(_sysstr(encoding)) != uni:
                             # lossy conversion: keep the UTF-8 original alongside
                             return localstr(s, local)
                         # local is a safe, non-lossy encoding of s
                         return local
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             uni = s.decode(_sysstr(fallbackencoding))
                             local = uni.encode(_sysstr(encoding), u"replace")
                             if local.decode(_sysstr(encoding)) != uni:
                                 return localstr(uni.encode('UTF-8'), local)
                             # local is a safe, non-lossy encoding of s
                             return local
                         except UnicodeDecodeError:
                             # last ditch; the result can't be round-tripped
                             lossy = s.decode("utf-8", "replace")
                             return lossy.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 Decoding uses the mode selected by HGENCODINGMODE, 'strict' by
                 default, in which undecodable input aborts with an error message.
                 The other modes are 'replace', which substitutes a special Unicode
                 character for unknown characters, and 'ignore', which drops them.

                 A localstr is not decoded at all: its cached UTF-8 form is returned.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip is available
                     return s._utf8
                 try:
                     return s.decode(_sysstr(encoding),
                                     _sysstr(encodingmode)).encode("utf-8")
                 except UnicodeDecodeError as inst:
                     # quote up to ten bytes on either side of the offending position
                     start = inst.start
                     sub = s[max(0, start - 10):start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def unitolocal(u):
                 """Return the unicode string u as a byte string in the local encoding"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 """Return the local-encoding byte string s as a unicode string"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             def unimethod(bytesfunc):
                 """Wrap bytesfunc so that its local-encoded bytes result comes back as
                 unicode; meant for forwarding Python 3 __unicode__() and __str__() to
                 __bytes__()"""
                 def unifunc(obj):
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)
                 return unifunc
             # Converter functions between native str and byte string. Use these where
             # the text carries no encoding information of its own (e.g. an exception
             # message) or is known to be locale dependent (e.g. date formatting).
             if pycompat.ispy3:
                 # native str is unicode: route through the unicode <-> local helpers
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 # native str is already a byte string: nothing to convert
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             # _wide holds the east_asian_width() codes that ucolwidth() counts as two
             # columns: "W" and "F" always, plus "A" (ambiguous) when requested.
             _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                             and "WFA" or "WF")
             def colwidth(s):
                 "Return the display column width of local-encoding string s"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display column width of Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 # characters whose width class is listed in _wide take two columns
                 return sum([2 if eaw(ch) in _wide else 1 for ch in d])
             def getcols(s, start, c):
                 '''Return the substring of s that begins at byte index start and is
                 exactly c columns wide according to colwidth.

                 Candidate end offsets run from start + c up to, but not including,
                 len(s); when none of them yields a c-column substring the function
                 falls off the end and returns None.'''
                 for end in xrange(start + c, len(s)):
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 A string that cannot be decoded in the local encoding is trimmed
                 bytewise, one column per byte.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # not valid in the local encoding: fall back to counting bytes
                     if len(s) <= width: # trimming is not needed
                         return s
                     room = width - len(ellipsis)
                     if room <= 0: # not enough room even for ellipsis
                         return ellipsis[:width]
                     if leftside:
                         return ellipsis + s[-room:]
                     return s[:room] + ellipsis

                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 room = width - len(ellipsis)
                 if room <= 0: # not enough room even for ellipsis
                     return ellipsis[:width]
                 # drop one more character from the trimmed side on each pass until
                 # what is left fits into the columns not taken by the ellipsis
                 for drop in xrange(1, len(u)):
                     if leftside:
                         kept = u[drop:]
                     else:
                         kept = u[:-drop]
                     if ucolwidth(kept) <= room:
                         encoded = kept.encode(_sysstr(encoding))
                         if leftside:
                             return ellipsis + encoded
                         return encoded + ellipsis
                 return ellipsis # not enough room for multi-column characters
-            def _asciilower(s):
-                '''convert a string to lowercase if ASCII
-                Raises UnicodeDecodeError if non-ASCII characters are found.'''
-                s.decode('ascii')
-                return s.lower()
-            def asciilower(s):
-                # delay importing avoids cyclic dependency around "parsers" in
-                # pure Python build (util => i18n => encoding => parsers => util)
-                parsers = policy.importmod(r'parsers')
-                impl = getattr(parsers, 'asciilower', _asciilower)
-                global asciilower
-                asciilower = impl
-                return impl(s)
-            def _asciiupper(s):
-                '''convert a string to uppercase if ASCII
-                Raises UnicodeDecodeError if non-ASCII characters are found.'''
-                s.decode('ascii')
-                return s.upper()
-            def asciiupper(s):
-                # delay importing avoids cyclic dependency around "parsers" in
-                # pure Python build (util => i18n => encoding => parsers => util)
-                parsers = policy.importmod(r'parsers')
-                impl = getattr(parsers, 'asciiupper', _asciiupper)
-                global asciiupper
-                asciiupper = impl
-                return impl(s)
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     # pure-ASCII input is handled entirely by the ASCII helper
                     return asciilower(s)
                 except UnicodeDecodeError:
                     pass
                 try:
                     # a localstr folds from its cached UTF-8 form, anything else from
                     # the local encoding
                     u = (s._utf8.decode("utf-8") if isinstance(s, localstr)
                          else s.decode(_sysstr(encoding), _sysstr(encodingmode)))
                     lu = u.lower()
                     if lu != u:
                         return lu.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     folded = asciiupper(s)
                 except UnicodeDecodeError:
                     # non-ASCII input: take the encoding-aware slow path
                     return upperfallback(s)
                 return folded
             def upperfallback(s):
                 "encoding-aware uppercasing of local string s, used by upper()"
                 try:
                     # a localstr folds from its cached UTF-8 form, anything else from
                     # the local encoding
                     u = (s._utf8.decode("utf-8") if isinstance(s, localstr)
                          else s.decode(_sysstr(encoding), _sysstr(encodingmode)))
                     uu = u.upper()
                     if uu != u:
                         return uu.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings

                 The value is chosen per platform and should agree with what normcase
                 actually does on that platform.

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''
                 # listed in ascending numeric order
                 lower = -1
                 other = 0
                 upper = 1
             # Escape tables for jsonescape(): entry i is the JSON text emitted for the
             # byte (or UTF-16 code unit) with value i.
             _jsonmap = []
             # 0x00-0x1f: control characters get a \uXXXX escape by default
             _jsonmap.extend("\\u%04x" % x for x in range(32))
             # 0x20-0x7e: printable ASCII passes through (a few entries are overridden
             # just below)
             _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
             _jsonmap.append('\\u007f')
             # characters that have a dedicated short escape
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             # The paranoid table is copied while the list still has only 128 entries,
             # so indexing it with a byte >= 0x80 raises IndexError; jsonescape() relies
             # on that to divert non-ASCII input to its \uXXXX slow path.
             _paranoidjsonmap = _jsonmap[:]
             _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # the regular table passes bytes 0x80-0xff through unchanged
             _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON cannot carry non-Unicode bytes, so the input is handled like
                 this:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 table = _paranoidjsonmap if paranoid else _jsonmap

                 u8chars = toutf8b(s)
                 try:
                     # fast path: translate byte by byte
                     return ''.join(table[b] for b in bytearray(u8chars))
                 except IndexError:
                     # only the 128-entry paranoid table can fail, on a non-ASCII byte
                     pass
                 # non-BMP char is represented as UTF-16 surrogate pair
                 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
                 u16codes.pop(0)  # drop BOM
                 pieces = []
                 for unit in u16codes:
                     if unit < 128:
                         pieces.append(table[unit])
                     else:
                         pieces.append('\\u%04x' % unit)
                 return ''.join(pieces)
             # Indexed by the high nibble of a byte: the number of bytes getutf8char()
             # tries to decode when that byte starts a character. 0 marks ASCII
             # (0x0-0x7), which is returned without decoding; 0x8-0xb get 1, 0xc-0xd get
             # 2, 0xe gets 3 and 0xf gets 4.
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''Return the complete UTF-8 character of s that begins at index pos.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 first = s[pos]
                 # the high nibble of the first byte selects how many bytes to try
                 size = _utf8len[ord(first) >> 4]
                 if size == 0: # ascii
                     return first

                 candidate = s[pos:pos + size]
                 # the decode is done purely for validation; its result is discarded
                 candidate.decode("utf-8")
                 return candidate
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # A localstr always answers with its cached UTF-8 form, even when
                     # its local representation happens to contain a "\xed" byte (which
                     # used to push it onto the escaping path below). This assumes the
                     # cached UTF-8 never contains characters in the U+DCxx range.
                     return s._utf8

                 if "\xed" not in s:
                     # no U+Dxxx prefix present: valid UTF-8 passes through untouched
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass

                 # slow path: walk s one character at a time and escape whatever is not
                 # a valid UTF-8 character; pieces are joined once at the end
                 pieces = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters: only the
                             # first byte is consumed, so the remaining bytes get
                             # escaped on the following iterations
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not the start of a valid character: escape this single byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     pieces.append(c)
                 return ''.join(pieces)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 The original binary string is returned. This is a round-trip
                 process for strings like filenames, but metadata that was passed
                 through tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - every escaped byte starts with 0xed, so a string
                 # without one has nothing to unescape
                 if "\xed" not in s:
                     return s
                 # The unicode type could do this, but some Python builds use
                 # UTF-16 internally (issue5031), which causes non-BMP code points
                 # to be escaped. Walking the bytes with getutf8char avoids
                 # "decoding" the string as a whole.
                 pieces = []
                 idx, end = 0, len(s)
                 while idx < end:
                     char = getutf8char(s, idx)
                     idx += len(char)
                     if "\xed\xb0\x80" <= char <= "\xed\xb3\xbf":
                         # U+DCxx: the low 8 bits are the original byte
                         char = chr(ord(char.decode("utf-8")) & 0xff)
                     pieces.append(char)
                 return "".join(pieces)

             # policy.py - module policy logic for Mercurial.
             #
             # Copyright 2015 Gregory Szorc <gregory.szorc@gmail.com>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import os
             import sys
             # How modules may be loaded. Each policy names a "versioned" package
             # that is tried first and a pure-Python package used as a fallback:
             #
             #    c - require C extensions
             #    allow - allow pure Python implementation when C loading fails
             #    cffi - required cffi versions (implemented within pure module)
             #    cffi-allow - allow pure Python implementation if cffi version is missing
             #    py - only load pure Python modules
             _packageprefs = {
                 # policy: (versioned package, pure package)
                 b'c': (r'cext', None),
                 b'allow': (r'cext', r'pure'),
                 b'cffi': (r'cffi', None),
                 b'cffi-allow': (r'cffi', r'pure'),
                 b'py': (None, r'pure'),
             }
             # The policy is settled in steps; each later step overrides the earlier.
             #
             # 1. Default: fall back to the pure modules so the in-place build can
             #    run without recompiling the C extensions.
             policy = b'allow'
             # 2. __modulepolicy__, generated by setup.py, when it is importable.
             try:
                 from . import __modulepolicy__
                 policy = __modulepolicy__.modulepolicy
             except ImportError:
                 pass
             # 3. PyPy doesn't load C extensions. platform.python_implementation()
             #    would be the canonical test, but platform is not imported here to
             #    avoid the bloat.
             if r'__pypy__' in sys.builtin_module_names:
                 policy = b'cffi'
             # 4. Our C extensions aren't yet compatible with Python 3, so Python 3
             #    uses pure Python for now.
             # 5. The HGMODULEPOLICY environment variable can always force a setting.
             #    On Python 3 its value is encoded as UTF-8; on Python 2 it is used
             #    as is.
             if sys.version_info[0] >= 3:
                 policy = b'py'
                 try:
                     policy = os.environ[r'HGMODULEPOLICY'].encode(r'utf-8')
                 except KeyError:
                     pass
             else:
                 policy = os.environ.get(r'HGMODULEPOLICY', policy)
             def _importfrom(pkgname, modname):
                 """Return module ``modname`` from the sibling package ``pkgname``.

                 Equivalent to ``from .<pkgname> import <modname>``, resolved relative
                 to this module. Raises ImportError when the package has no such
                 attribute.
                 """
                 # the dict stands in for the importing scope's locals
                 scope = {}
                 package = __import__(pkgname, globals(), scope, [modname], level=1)
                 try:
                     module = getattr(package, modname)
                 except AttributeError:
                     raise ImportError(r'cannot import name %s' % modname)
                 scope[modname] = module
                 # Touch an attribute to force the import; scope[modname] may be
                 # replaced with the real module as a result, so the value is
                 # re-read from the dict rather than returned from the local.
                 getattr(module, r'__doc__', None)
                 return scope[modname]
             # keep in sync with "version" in C modules
             #
             # Maps (package, module) to the API version that _checkmod() requires
             # the imported module's "version" attribute to equal. A module with no
             # entry here is expected to have no "version" attribute (or None).
             _cextversions = {
                 (r'cext', r'base85'): 1,
                 (r'cext', r'bdiff'): 1,
                 (r'cext', r'diffhelpers'): 1,
                 (r'cext', r'mpatch'): 1,
                 (r'cext', r'osutil'): 1,
                 (r'cext', r'parsers'): 1,
             }
             # map import request to other package or module
             #
             # Keys are the (package, module) pair being requested, values the
             # (package, module) pair that importmod() imports instead. Entries must
             # be plain tuples: a stray leading "+" (diff marker) would apply unary
             # plus to the key tuple and raise TypeError when this dict is built.
             _modredirects = {
                 (r'cext', r'charencode'): (r'cext', r'parsers'),
                 (r'cffi', r'base85'): (r'pure', r'base85'),
                 (r'cffi', r'charencode'): (r'pure', r'charencode'),
                 (r'cffi', r'diffhelpers'): (r'pure', r'diffhelpers'),
                 (r'cffi', r'parsers'): (r'pure', r'parsers'),
             }
             def _checkmod(pkgname, modname, mod):
                 """Check that ``mod`` has the API version recorded for it.

                 The expected version is looked up in _cextversions under
                 (pkgname, modname) and compared with the module's ``version``
                 attribute; both default to None when absent. Returns None when they
                 match and raises ImportError when they differ.
                 """
                 expected = _cextversions.get((pkgname, modname))
                 actual = getattr(mod, r'version', None)
                 if actual != expected:
                     # expected is None for a module with no _cextversions entry;
                     # formatting it with %d would raise TypeError, which importmod()
                     # does not catch, so %r is used for both values. For integer
                     # versions the message is unchanged.
                     raise ImportError(r'cannot import module %s.%s '
                                       r'(expected version: %r, actual: %r)'
                                       % (pkgname, modname, expected, actual))
             def importmod(modname):
                 """Import module according to policy and check API version

                 The current policy selects a versioned package and/or a pure package.
                 The versioned one is tried first, after applying any redirect, and
                 its API version is checked unless the redirect moved the request to
                 a different package. If that import fails and a pure package is
                 allowed, the pure one is imported instead. Raises ImportError for an
                 unknown policy or when no permitted implementation can be imported.
                 """
                 try:
                     verpkg, purepkg = _packageprefs[policy]
                 except KeyError:
                     raise ImportError(r'invalid HGMODULEPOLICY %r' % policy)
                 assert verpkg or purepkg
                 if verpkg:
                     wanted = (verpkg, modname)
                     pkg, name = _modredirects.get(wanted, wanted)
                     try:
                         found = _importfrom(pkg, name)
                         # a request redirected into another package skips the check
                         if pkg == verpkg:
                             _checkmod(pkg, name, found)
                         return found
                     except ImportError:
                         # only fatal when there is no pure fallback to try
                         if not purepkg:
                             raise
                 fallback = (purepkg, modname)
                 pkg, name = _modredirects.get(fallback, fallback)
                 return _importfrom(pkg, name)