upstream/mercurial-mirror Commit - r30622:ce36fa9b

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

pycompat,

17

pycompat,

18

)

18

)

19

20

# converter applied to codec names and error-handler names before they are
# handed to str.decode()/unicode.encode() throughout this module
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # no unichr() builtin on Python 3; chr() is used in its place
    unichr = chr

# HFS+ ignores the unicode characters listed below (Apple Technote 1150,
# "Unicode Subtleties"), so for sanity we need to ignore them in some
# places as well.  Each entry is the UTF-8 encoding of one code point.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# hfsignoreclean() only looks for the lead bytes 0xe2 and 0xef before doing
# any work; verify that every ignored sequence starts with one of them
if not pycompat.ispy3:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
else:
    # indexing a bytes object yields an int on Python 3
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])

36

37

def hfsignoreclean(s):
    """Return s with every codepoint that HFS+ ignores stripped out.

    s is a UTF-8 encoded byte string; the sequences removed are those
    listed in the module-level _ignore table.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every entry of _ignore starts with one of these two bytes (asserted
    # at import time), so anything without them is returned untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s

49

50

# encoding.environ is a read-only view of the environment; it must not be
# used to modify the process environment.
#
# _nativeenviron is true when the platform itself provides a bytes
# environment (always on Python 2, os.supports_bytes_environ on Python 3).
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if pycompat.ispy3:
    if _nativeenviron:
        environ = os.environb
    else:
        # preferred encoding isn't known yet; use utf-8 to avoid unicode
        # error and recreate it once encoding is settled
        environ = dict((key.encode(u'utf-8'), value.encode(u'utf-8'))
                       for key, value in os.environ.items())
else:
    environ = os.environ

62

63

def _getpreferredencoding():
    '''Return the codeset name of the user's LC_CTYPE locale.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected.

    May raise locale.Error if the user's locale settings are not accepted
    by setlocale().
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # temporarily switch LC_CTYPE to the user's default locale so that
    # nl_langinfo() reports its codeset
    oldloc = locale.setlocale(locale.LC_CTYPE)
    locale.setlocale(locale.LC_CTYPE, "")
    try:
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # always put the previous locale back, even if the query failed,
        # so the process-wide setting is never left modified
        locale.setlocale(locale.LC_CTYPE, oldloc)

86

87

# Encoding names reported by the platform that need replacing; each value
# is a callable returning the name to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding,
}

# Local encoding: HGENCODING wins; otherwise ask the locale module and run
# the answer through _encodingfixers.  Broken locale settings mean ascii.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error-handler name passed to decode() when converting local strings
encodingmode = environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() when stored data turns out not to be valid UTF-8
fallbackencoding = 'ISO-8859-1'

102

103

class localstr(str):
    '''A local-encoding string that also carries its UTF-8 original.

    The string value itself is the local representation 'l'; the UTF-8
    form 'u' is kept in the _utf8 attribute so that an unmodified string
    can be round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        obj = super(localstr, cls).__new__(cls, l)
        obj._utf8 = u
        return obj
    def __hash__(self):
        # hash the UTF-8 form rather than the local one to avoid
        # collisions in local string space
        return hash(self._utf8)

112

113

def tolocal(s):
    """
    Convert a string from internal UTF-8 to the local encoding

    Internal strings are expected to be UTF-8, but repos that predate
    locale support may contain latin1 or possibly other character sets.
    Decoding is therefore attempted strictly as UTF-8 first, then as the
    fallback encoding (Latin-1), and as a last resort as UTF-8 with
    unknown characters replaced.

    When the local encoding cannot represent the string exactly, a
    localstr is returned: it caches the known UTF-8 encoding next to the
    local representation so that fromlocal() can round-trip it losslessly.

    Raises error.Abort if a codec name is unknown (LookupError).

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # a strict decode proves the string really is stored as UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding and storage encoding coincide
                return s
            local = text.encode(_sysstr(encoding), u"replace")
            if text != local.decode(_sysstr(encoding)):
                # lossy: keep the original UTF-8 alongside
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient
            # changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), u"replace")
                if text != local.decode(_sysstr(encoding)):
                    return localstr(text.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch; this result can't be round-tripped
                text = s.decode("utf-8", "replace")
                return text.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

174

175

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    The string is decoded with the error mode given by HGENCODINGMODE,
    which defaults to 'strict'. In that mode, unknown characters cause an
    error message. Other modes include 'replace', which replaces unknown
    characters with a special Unicode character, and 'ignore', which drops
    the character.

    Raises error.Abort on a decoding failure or an unknown codec name.
    """

    # a localstr already knows its UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes either side of the failure position
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[lo:hi], inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

198

199

if not _nativeenviron:
    # encoding and the helper functions exist by now, so rebuild the
    # environ dict that is exported to other modules using tolocal()
    environ = dict((tolocal(key.encode(u'utf-8')),
                    tolocal(value.encode(u'utf-8')))
                   for key, value in os.environ.items())

# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
# 'wide' to treat them as wide.  The value holds the east-asian-width
# classes that ucolwidth() counts as two columns.
wide = ("WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")

208

209

def colwidth(s):
    "Return the display column width of local-encoding string s"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)

212

213

def ucolwidth(d):
    "Return the display column width of Unicode string d"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    columns = 0
    for ch in d:
        # characters whose width class is listed in 'wide' take two columns
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns

219

220

def getcols(s, start, c):
    '''Find a substring of s, starting at byte index start, whose
    colwidth is exactly c columns.

    Candidate end positions run from start + c up to, but not including,
    len(s). The first match is returned; when there is none the function
    falls off the end and returns None.'''
    end = start + c
    limit = len(s)
    while end < limit:
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
        end += 1

227

228

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    A string that cannot be decoded in the local encoding is trimmed by
    bytes instead of display columns.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        text = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to plain byte counting
        if len(s) <= width:
            # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0:
            # not enough room even for the ellipsis: cut it to 'width'
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(text) <= width:
        # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0:
        # not enough room even for the ellipsis: cut it to 'width'
        return ellipsis[:width]

    # drop one more character from the trimmed side each round until the
    # remainder fits in the columns left over after the ellipsis
    dropped = 1
    total = len(text)
    while dropped < total:
        if leftside:
            kept = text[dropped:]
        else:
            kept = text[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
        dropped += 1
    # not enough room for multi-column characters
    return ellipsis

320

321

def _asciilower(s):
    '''lowercase a string, provided it is pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is done purely for validation; its result is discarded
    s.decode('ascii')
    lowered = s.lower()
    return lowered

327

328

def asciilower(s):
    # On first call, pick the implementation (parsers.asciilower when the
    # parsers module provides one, else the pure-Python _asciilower) and
    # rebind this module-level name to it, so later calls go straight to it.
    #
    # The import is delayed to avoid a cyclic dependency around "parsers"
    # in pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciilower
    asciilower = getattr(parsers, 'asciilower', _asciilower)
    return asciilower(s)

336

337

def _asciiupper(s):
    '''uppercase a string, provided it is pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode is done purely for validation; its result is discarded
    s.decode('ascii')
    uppered = s.upper()
    return uppered

343

344

def asciiupper(s):
    # On first call, pick the implementation (parsers.asciiupper when the
    # parsers module provides one, else the pure-Python _asciiupper) and
    # rebind this module-level name to it, so later calls go straight to it.
    #
    # The import is delayed to avoid a cyclic dependency around "parsers"
    # in pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    global asciiupper
    asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
    return asciiupper(s)

352

353

def lower(s):
    "best-effort encoding-aware lowercasing of local string s"
    # pure-ASCII input is handled by the cheap path
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the exact UTF-8 form rather than the lossy local one
            source = s._utf8.decode("utf-8")
        else:
            source = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = source.lower()
        if folded != source:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

373

374

def upper(s):
    "best-effort encoding-aware uppercasing of local string s"
    try:
        # cheap path for pure-ASCII input
        result = asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: use the encoding-aware fallback
        result = upperfallback(s)
    return result

380

381

def upperfallback(s):
    # Encoding-aware uppercasing, used by upper() when s is not pure ASCII.
    try:
        if isinstance(s, localstr):
            # use the exact UTF-8 form rather than the lossy local one
            source = s._utf8.decode("utf-8")
        else:
            source = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = source.upper()
        if folded != source:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

396

397

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    The value is chosen per platform and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    other = 0
    upper = 1

411

412

# JSON escape tables used by jsonescape(), indexed by byte/code value.
#
# _jsonmap covers 0-255: control characters and DEL are escaped, the rest
# of ASCII and every byte >= 128 map to themselves.
# _paranoidjsonmap covers 0-127 only and additionally escapes '<' and '>'.

# control characters default to their \uXXXX form
_jsonmap = list("\\u%04x" % x for x in range(32))
# printable ASCII maps to itself
_jsonmap.extend(chr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# short escapes take precedence where they exist
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# the paranoid table is copied before the high bytes are added, so it
# stops at 127
_paranoidjsonmap = list(_jsonmap)
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(chr(x) for x in range(128, 256))

427

428

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    table = _paranoidjsonmap if paranoid else _jsonmap

    u8chars = toutf8b(s)
    try:
        # fast path: every byte has an entry in the table
        return ''.join(table[b] for b in bytearray(u8chars))
    except IndexError:
        # only the paranoid table (128 entries) can miss: a byte >= 128
        pass
    # re-encode as UTF-16 so that a non-BMP char is represented as a
    # surrogate pair of 16-bit code units
    units = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    escaped = []
    for code in units[1:]: # the first unit is the BOM; skip it
        if code < 128:
            escaped.append(table[code])
        else:
            escaped.append('\\u%04x' % code)
    return ''.join(escaped)

483

484

# Number of bytes getutf8char() tries to decode, indexed by the high nibble
# of the first byte.  0 (nibbles 0x0-0x7) marks plain ASCII.
_utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]

485

486

def getutf8char(s, pos):

486

def getutf8char(s, pos):

487

'''get the next full utf-8 character in the given string, starting at pos

487

'''get the next full utf-8 character in the given string, starting at pos

488

489

Raises a UnicodeError if the given location does not start a valid

489

Raises a UnicodeError if the given location does not start a valid

490

utf-8 character.

490

utf-8 character.

491

'''

491

'''

492

493

# find how many bytes to attempt decoding from first nibble

493

# find how many bytes to attempt decoding from first nibble

494

l = _utf8len[ord(s[pos]) >> 4]

494

l = _utf8len[ord(s[pos]) >> 4]

495

if not l: # ascii

495

if not l: # ascii

496

return s[pos]

496

return s[pos]

497

498

c = s[pos:pos + l]

498

c = s[pos:pos + l]

499

# validate with attempted decode

499

# validate with attempted decode

500

c.decode("utf-8")

500

c.decode("utf-8")

501

return c

501

return c

502

503

def toutf8b(s):

503

def toutf8b(s):

504

'''convert a local, possibly-binary string into UTF-8b

504

'''convert a local, possibly-binary string into UTF-8b

505

506

This is intended as a generic method to preserve data when working

506

This is intended as a generic method to preserve data when working

507

with schemes like JSON and XML that have no provision for

507

with schemes like JSON and XML that have no provision for

508

arbitrary byte strings. As Mercurial often doesn't know

508

arbitrary byte strings. As Mercurial often doesn't know

509

what encoding data is in, we use so-called UTF-8b.

509

what encoding data is in, we use so-called UTF-8b.

510

511

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

511

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

512

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

512

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

513

uDC00-uDCFF.

513

uDC00-uDCFF.

514

515

Principles of operation:

515

Principles of operation:

516

517

- ASCII and UTF-8 data successfully round-trips and is understood

517

- ASCII and UTF-8 data successfully round-trips and is understood

518

by Unicode-oriented clients

518

by Unicode-oriented clients

519

- filenames and file contents in arbitrary other encodings can have

519

- filenames and file contents in arbitrary other encodings can have

520

be round-tripped or recovered by clueful clients

520

be round-tripped or recovered by clueful clients

521

- local strings that have a cached known UTF-8 encoding (aka

521

- local strings that have a cached known UTF-8 encoding (aka

522

localstr) get sent as UTF-8 so Unicode-oriented clients get the

522

localstr) get sent as UTF-8 so Unicode-oriented clients get the

523

Unicode data they want

523

Unicode data they want

524

- because we must preserve UTF-8 bytestring in places such as

524

- because we must preserve UTF-8 bytestring in places such as

525

filenames, metadata can't be roundtripped without help

525

filenames, metadata can't be roundtripped without help

526

527

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

527

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

528

arbitrary bytes into an internal Unicode format that can be

528

arbitrary bytes into an internal Unicode format that can be

529

re-encoded back into the original. Here we are exposing the

529

re-encoded back into the original. Here we are exposing the

530

internal surrogate encoding as a UTF-8 string.)

530

internal surrogate encoding as a UTF-8 string.)

531

'''

531

'''

532

533

if "\xed" not in s:

533

if "\xed" not in s:

534

if isinstance(s, localstr):

534

if isinstance(s, localstr):

535

return s._utf8

535

return s._utf8

536

try:

536

try:

537

s.decode('utf-8')

537

s.decode('utf-8')

538

return s

538

return s

539

except UnicodeDecodeError:

539

except UnicodeDecodeError:

540

pass

540

pass

541

542

r = ""

542

r = ""

543

pos = 0

543

pos = 0

544

l = len(s)

544

l = len(s)

545

while pos < l:

545

while pos < l:

546

try:

546

try:

547

c = getutf8char(s, pos)

547

c = getutf8char(s, pos)

548

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

548

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

549

# have to re-escape existing U+DCxx characters

549

# have to re-escape existing U+DCxx characters

550

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

550

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

551

pos += 1

551

pos += 1

552

else:

552

else:

553

pos += len(c)

553

pos += len(c)

554

except UnicodeDecodeError:

554

except UnicodeDecodeError:

555

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

555

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

556

pos += 1

556

pos += 1

557

r += c

557

r += c

558

return r

558

return r

559

560

def fromutf8b(s):

560

def fromutf8b(s):

561

'''Given a UTF-8b string, return a local, possibly-binary string.

561

'''Given a UTF-8b string, return a local, possibly-binary string.

562

563

return the original binary string. This

563

return the original binary string. This

564

is a round-trip process for strings like filenames, but metadata

564

is a round-trip process for strings like filenames, but metadata

565

that's was passed through tolocal will remain in UTF-8.

565

that's was passed through tolocal will remain in UTF-8.

566

567

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

567

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

568

>>> m = "\\xc3\\xa9\\x99abcd"

568

>>> m = "\\xc3\\xa9\\x99abcd"

569

>>> toutf8b(m)

569

>>> toutf8b(m)

570

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

570

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

571

>>> roundtrip(m)

571

>>> roundtrip(m)

572

True

572

True

573

>>> roundtrip("\\xc2\\xc2\\x80")

573

>>> roundtrip("\\xc2\\xc2\\x80")

574

True

574

True

575

>>> roundtrip("\\xef\\xbf\\xbd")

575

>>> roundtrip("\\xef\\xbf\\xbd")

576

True

576

True

577

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

577

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

578

True

578

True

579

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

579

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

580

True

580

True

581

'''

581

'''

582

583

# fast path - look for uDxxx prefixes in s

583

# fast path - look for uDxxx prefixes in s

584

if "\xed" not in s:

584

if "\xed" not in s:

585

return s

585

return s

586

587

# We could do this with the unicode type but some Python builds

587

# We could do this with the unicode type but some Python builds

588

# use UTF-16 internally (issue5031) which causes non-BMP code

588

# use UTF-16 internally (issue5031) which causes non-BMP code

589

# points to be escaped. Instead, we use our handy getutf8char

589

# points to be escaped. Instead, we use our handy getutf8char

590

# helper again to walk the string without "decoding" it.

590

# helper again to walk the string without "decoding" it.

591

592

r = ""

592

r = ""

593

pos = 0

593

pos = 0

594

l = len(s)

594

l = len(s)

595

while pos < l:

595

while pos < l:

596

c = getutf8char(s, pos)

596

c = getutf8char(s, pos)

597

pos += len(c)

597

pos += len(c)

598

# unescape U+DCxx characters

598

# unescape U+DCxx characters

599

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

599

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

600

c = chr(ord(c.decode("utf-8")) & 0xff)

600

c = chr(ord(c.decode("utf-8")) & 0xff)

601

r += c

601

r += c

602

return r

602

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 pycompat,
             )
             # local alias; used below to wrap the encoding and error-mode names that
             # are handed to decode()/encode()
             _sysstr = pycompat.sysstr
             # when pycompat reports Python 3, provide the unichr name via chr so the
             # code below can use unichr unconditionally
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry is the UTF-8 encoding of one of the listed code points.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: every ignored sequence must start
             # with byte 0xe2 or 0xef, the two bytes hfsignoreclean tests for before
             # doing any replacement.  The Python 3 branch compares against ord()
             # values, the other branch against one-character strings.
             if pycompat.ispy3:
                 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
             else:
                 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip every code point that HFS+ ignores from the UTF-8 string s.

                 The string is returned untouched when it contains neither of the
                 two lead bytes (0xe2, 0xef) that start the ignored sequences.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # cheap pre-check: nothing to remove without one of the lead bytes
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for ignored in _ignore:
                     s = s.replace(ignored, '')
                 return s
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # _nativeenviron is true when not on Python 3, or when
             # os.supports_bytes_environ is true; in both cases environ below is bound
             # directly to an os-provided mapping rather than to a copy
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ
             elif _nativeenviron:
                 environ = os.environb
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())
             def _getpreferredencoding():
                 '''Return the codeset of the user's default locale.

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some
                 distributions patch Python to fix this. Instead, we use it as a
                 'fixer' for the mac-roman encoding, as it is unlikely that this
                 encoding is the actually expected.

                 When locale.CODESET is unavailable the answer comes from
                 locale.getdefaultlocale() instead.  Otherwise LC_CTYPE is switched
                 to the user's default locale only while nl_langinfo is queried, and
                 the previous setting is put back afterwards even if the query
                 raises.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 try:
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # LC_CTYPE is process-wide state: never leave it changed, not
                     # even when nl_langinfo fails
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Maps an encoding name to a zero-argument callable returning the name
             # that should be used in its place.  Two ASCII aliases are mapped to
             # 'ascii'; 'mac-roman' is re-detected through _getpreferredencoding.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
-                    encoding = locale.getpreferredencoding() or 'ascii'
+                    encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original.

                 The string value itself is the local representation l; the UTF-8
                 form u is kept in the _utf8 attribute so that an unmodified string
                 can be round-tripped to the local encoding and back.'''
                 def __new__(cls, u, l):
                     obj = super(localstr, cls).__new__(cls, l)
                     obj._utf8 = u
                     return obj
                 def __hash__(self):
                     # hash the UTF-8 form, not the local one, to avoid collisions
                     # in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns s itself when it is valid UTF-8 and the local encoding is
                 'UTF-8', a plain re-encoded string when the conversion loses
                 nothing, and a localstr otherwise. An unknown codec name
                 (LookupError) is reported as error.Abort.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         # characters the local encoding lacks become replacement
                         # marks here; the comparison below detects that loss
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the original UTF-8 alongside the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the UTF-8 re-encoding of the fallback decode
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     # raised by any decode/encode above when a codec name is unknown
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr is answered straight from its cached UTF-8 form. Any
                 other string is decoded with the local encoding using the error
                 mode from HGENCODINGMODE, which defaults to 'strict'. In this mode,
                 unknown characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.

                 Decoding failures and unknown codec names are raised as error.Abort.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip: the UTF-8 original was kept
                     return s._utf8
                 try:
                     decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as inst:
                     # quote up to ten bytes either side of the offending position
                     where = inst.start
                     sub = s[max(0, where - 10):where + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             if not _nativeenviron:
                 # encoding and tolocal are defined by now, so rebuild the environ
                 # dict exported to other modules with keys and values in the local
                 # encoding
                 environ = dict(
                     (tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                     for k, v in os.environ.items())
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             # The value lists the east_asian_width classes that ucolwidth counts as
             # two columns.
             wide = ("WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                     else "WF")
             def colwidth(s):
                 "Return the display column width of s, a string in the local encoding"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display column width of the Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for ch in d:
                     # characters whose width class is listed in wide take two columns
                     columns += 2 if eaw(ch) in wide else 1
                 return columns
             def getcols(s, start, c):
                 '''Return the substring of s that begins at byte index start and is
                 exactly c columns wide according to colwidth.

                 Candidate end offsets run from start + c up to, but not including,
                 len(s). When none of them yields a width of c the function falls
                 off the end and returns None.'''
                 for end in xrange(start + c, len(s)):
                     piece = s[start:end]
                     if colwidth(piece) != c:
                         continue
                     return piece
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 A string that cannot be decoded with the local encoding is trimmed
                 by bytes instead of by display columns.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # undecodable: fall back to measuring and cutting raw bytes
                     u = None
                 if u is None:
                     current = len(s)
                 else:
                     current = ucolwidth(u)
                 if current <= width: # trimming is not needed
                     return s
                 room = width - len(ellipsis)
                 if room <= 0: # no enough room even for ellipsis
                     return ellipsis[:width]
                 if u is None:
                     if leftside:
                         return ellipsis + s[-room:]
                     return s[:room] + ellipsis
                 # drop one more character per round from the trimmed side until
                 # what is left fits into the remaining columns
                 for i in xrange(1, len(u)):
                     if leftside:
                         usub = u[i:]
                     else:
                         usub = u[:-i]
                     if ucolwidth(usub) > room:
                         continue
                     kept = usub.encode(_sysstr(encoding))
                     if leftside:
                         return ellipsis + kept
                     return kept + ellipsis
                 return ellipsis # no enough room for multi-column characters
             def _asciilower(s):
                 '''Return s lowercased, provided it is pure ASCII.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # the decode result is discarded; the call only validates s
                 s.decode('ascii')
                 lowered = s.lower()
                 return lowered
             def asciilower(s):
                 '''Lowercase the ASCII string s.

                 On first use this picks parsers.asciilower when the parsers module
                 provides it, otherwise _asciilower, and rebinds the module-level
                 name asciilower to the chosen implementation.'''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 asciilower = getattr(parsers, 'asciilower', _asciilower)
                 return asciilower(s)
             def _asciiupper(s):
                 '''Return s uppercased, provided it is pure ASCII.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # the decode result is discarded; the call only validates s
                 s.decode('ascii')
                 uppered = s.upper()
                 return uppered
             def asciiupper(s):
                 '''Uppercase the ASCII string s.

                 On first use this picks parsers.asciiupper when the parsers module
                 provides it, otherwise _asciiupper, and rebinds the module-level
                 name asciiupper to the chosen implementation.'''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
                 return asciiupper(s)
             def lower(s):
                 """Best-effort, encoding-aware lowercasing of the local string s.

                 Pure ASCII input goes through asciilower. Otherwise s is decoded
                 (from its cached UTF-8 form when it is a localstr), lowercased as
                 Unicode and re-encoded in the local encoding. s itself is returned
                 when lowercasing changes nothing. An unknown codec name is raised
                 as error.Abort."""
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not ASCII: take the Unicode route below
                     pass
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     folded = text.lower()
                     if text != folded:
                         return folded.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 """Best-effort, encoding-aware uppercasing of the local string s.

                 Tries asciiupper first and hands the string to upperfallback when
                 it turns out not to be ASCII."""
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 """Uppercase the local string s by way of Unicode.

                 s is decoded (from its cached UTF-8 form when it is a localstr),
                 uppercased and re-encoded in the local encoding. s itself is
                 returned when uppercasing changes nothing. An unknown codec name is
                 raised as error.Abort."""
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     folded = text.upper()
                     if text != folded:
                         return folded.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings

                 This is specified per platform, and should be consistent with what normcase
                 on that platform actually does.

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''
                 # plain integer constants; the class only serves as a namespace
                 lower = -1
                 upper = 1
                 other = 0
             # Lookup tables indexed by byte value, giving the JSON text for that byte.
             _jsonmap = []
             # 0x00-0x1f: control characters, written as \uXXXX by default
             _jsonmap.extend("\\u%04x" % x for x in range(32))
             # 0x20-0x7e: printable ASCII maps to itself
             _jsonmap.extend(chr(x) for x in range(32, 127))
             # 0x7f (DEL) is escaped as well
             _jsonmap.append('\\u007f')
             # override the entries that have a short escape form
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             # The paranoid table is copied while only the 128 ASCII entries exist,
             # so indexing it with a byte >= 0x80 raises IndexError.
             _paranoidjsonmap = _jsonmap[:]
             _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # only the non-paranoid table gets entries for 0x80-0xff, each mapping
             # to the corresponding single character unchanged
             _jsonmap.extend(chr(x) for x in range(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 jm = _paranoidjsonmap if paranoid else _jsonmap
                 u8chars = toutf8b(s)
                 try:
                     # fast path: translate byte by byte through the table; a byte
                     # with no table entry raises IndexError and sends us below
                     return ''.join(jm[b] for b in bytearray(u8chars))
                 except IndexError:
                     pass
                 # slow path: go through UTF-16 code units, so that a
                 # non-BMP char is represented as UTF-16 surrogate pair
                 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
                 u16codes.pop(0)  # drop BOM
                 pieces = []
                 for code in u16codes:
                     if code < 128:
                         pieces.append(jm[code])
                     else:
                         pieces.append('\\u%04x' % code)
                 return ''.join(pieces)
             # Number of bytes to try decoding, indexed by the high nibble of the first
             # byte; 0 marks the nibbles (0x0-0x7) that getutf8char treats as ASCII.
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''Return the complete utf-8 character of s that starts at pos.

                 An ASCII byte is returned as is. For any other byte the candidate
                 sequence is checked with a trial decode, so a UnicodeError is raised
                 if the given location does not start a valid utf-8 character.
                 '''
                 # the high nibble of the first byte selects how many bytes to try
                 lead = ord(s[pos])
                 nbytes = _utf8len[lead >> 4]
                 if nbytes == 0: # ascii
                     return s[pos]
                 candidate = s[pos:pos + nbytes]
                 # the decode result is discarded; the call only validates
                 candidate.decode("utf-8")
                 return candidate
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 # The shortcuts below apply only when no 0xed byte is present, since
                 # that is the lead byte of an encoded U+DCxx character.
                 if "\xed" not in s:
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         # have to re-escape existing U+DCxx characters
                         escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
                     except UnicodeDecodeError:
                         # not a valid character start: escape this byte
                         escape = True
                     if escape:
                         # map the single byte at pos into U+DC00-U+DCFF
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     else:
                         pos += len(c)
                     pieces.append(c)
                 return "".join(pieces)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 This reverses toutf8b(): every escaped U+DCxx character is turned
                 back into the single byte it stands for, and everything else is
                 copied through unchanged. It is a round-trip process for strings
                 like filenames, but metadata that was passed through tolocal will
                 remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - every uDxxx character starts with "\xed", so a
                 # string without that byte holds nothing to unescape
                 if "\xed" not in s:
                     return s
                 # The unicode type is avoided on purpose: some Python builds use
                 # UTF-16 internally (issue5031), which causes non-BMP code points
                 # to be escaped. getutf8char lets us step through the byte string
                 # character by character without "decoding" the whole thing.
                 pieces = []
                 offset = 0
                 end = len(s)
                 while offset < end:
                     char = getutf8char(s, offset)
                     offset += len(char)
                     if "\xed\xb0\x80" <= char <= "\xed\xb3\xbf":
                         # U+DCxx escape: its low 8 bits are the original byte
                         char = chr(ord(char.decode("utf-8")) & 0xff)
                     pieces.append(char)
                 return "".join(pieces)