upstream/mercurial-mirror Commit - r32276:1a3a08b5

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

pycompat,

17

pycompat,

18

)

18

)

19

20

# shorthand for converting a value to the native str type (used for codec
# names and error-handler names passed to encode()/decode())
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() already yields a unicode character
    unichr = chr

24

25

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean() only scans for the
# ignored sequences when it sees one of these two lead bytes
if pycompat.ispy3:
    # indexing bytes yields ints on Python 3
    assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
else:
    assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])

36

37

def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    s is a UTF-8 encoded byte string; the cleaned byte string is returned.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: every entry of _ignore starts with one of these two
    # bytes (asserted at import time), so strings without them are returned
    # untouched. Byte literals keep this working on bytes under Python 3.
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s

49

50

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items()) # re-exports

62

63

def _getpreferredencoding():

64

'''

65

On darwin, getpreferredencoding ignores the locale environment and

66

always returns mac-roman. http://bugs.python.org/issue6202 fixes this

67

for Python 2.7 and up. This is the same corrected code for earlier

68

Python versions.

69

70

However, we can't use a version check for this method, as some distributions

71

patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman

72

encoding, as it is unlikely that this encoding is the actually expected.

73

'''

74

try:

75

locale.CODESET

76

except AttributeError:

77

# Fall back to parsing environment variables :-(

78

return locale.getdefaultlocale()[1]

79

80

oldloc = locale.setlocale(locale.LC_CTYPE)

81

locale.setlocale(locale.LC_CTYPE, "")

82

result = locale.nl_langinfo(locale.CODESET)

83

locale.setlocale(locale.LC_CTYPE, oldloc)

84

85

return result

86

87

# maps an encoding name reported by the locale machinery to a callable
# returning the encoding name that should be used instead
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

92

67

93

# pick the local encoding: HGENCODING wins, otherwise ask the locale and
# run the answer through _encodingfixers; fall back to ascii on locale errors
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handler name used when decoding local strings (see fromlocal())
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() when stored data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

102

77

103

class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The instance's own value is the local representation 'l'; the
    original UTF-8 form 'u' is kept in the _utf8 attribute.'''
    def __new__(cls, u, l):
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space

112

87

113

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the original UTF-8 alongside the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # unknown codec name in 'encoding' or 'fallbackencoding'
        raise error.Abort(k, hint="please check your locale settings")

174

149

175

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises error.Abort if s cannot be decoded or the configured
    encoding is unknown.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes either side of the offending position
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

198

173

199

def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    # go through UTF-8 so tolocal() can apply its usual round-trip handling
    return tolocal(u.encode('utf-8'))

202

177

203

def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() returns UTF-8 bytes, which are then decoded
    return fromlocal(s).decode('utf-8')

206

181

207

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # native str is unicode on Python 3
    strtolocal = unitolocal
    strfromlocal = unifromlocal
else:
    # on Python 2 the values are passed through unchanged
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity

216

191

217

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items()) # re-exports

222

197

223

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east-asian-width classes (as a string of class
# letters) that ucolwidth() counts as two columns.
wide = ("WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")

226

201

227

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))

230

205

231

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # characters whose east-asian-width class is listed in 'wide' take
        # two columns, everything else one
        return sum(2 if eaw(c) in wide else 1 for c in d)
    # no width data available: assume one column per character
    return len(d)

237

212

238

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns None (implicitly) if no candidate substring is exactly c
    columns wide.'''
    # NOTE(review): the end index never reaches len(s), so the substring
    # running to the very end of s is never tried - confirm this is intended
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t

245

220

246

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # s is not valid in the local encoding: treat each byte as one
        # column and trim the raw byte string
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # uslice(i) drops i characters from the trimmed side; concat() puts the
    # ellipsis on that same side
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one more character at a time until the remainder fits
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # not enough room for multi-column characters

338

313

339

def _asciilower(s):

314

def _asciilower(s):

340

'''convert a string to lowercase if ASCII

315

'''convert a string to lowercase if ASCII

341

316

342

Raises UnicodeDecodeError if non-ASCII characters are found.'''

317

Raises UnicodeDecodeError if non-ASCII characters are found.'''

343

s.decode('ascii')

318

s.decode('ascii')

344

return s.lower()

319

return s.lower()

345

320

346

def asciilower(s):
    '''lowercase an ASCII string using parsers.asciilower when the parsers
    module provides it, else the pure-Python _asciilower

    On first call this function rebinds the module-level name
    'asciilower' to the chosen implementation.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)

354

329

355

def _asciiupper(s):

330

def _asciiupper(s):

356

'''convert a string to uppercase if ASCII

331

'''convert a string to uppercase if ASCII

357

332

358

Raises UnicodeDecodeError if non-ASCII characters are found.'''

333

Raises UnicodeDecodeError if non-ASCII characters are found.'''

359

s.decode('ascii')

334

s.decode('ascii')

360

return s.upper()

335

return s.upper()

361

336

362

def asciiupper(s):
    '''uppercase an ASCII string using parsers.asciiupper when the parsers
    module provides it, else the pure-Python _asciiupper

    On first call this function rebinds the module-level name
    'asciiupper' to the chosen implementation.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)

370

345

371

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: pure ASCII input
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the lossy local one
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

391

366

392

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    # fast path: pure ASCII input; otherwise take the encoding-aware route
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

398

373

399

def upperfallback(s):
    '''uppercase local string s by going through unicode

    Returns s itself when uppercasing changes nothing. Raises
    error.Abort if the configured encoding is unknown.'''
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form rather than the lossy local one
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

414

389

415

class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0

429

404

430

# escape tables indexed by byte value
_jsonmap = []
# control characters 0x00-0x1f default to \uXXXX escapes
_jsonmap.extend("\\u%04x" % x for x in range(32))
# 0x20-0x7e map to themselves
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# short escapes override the defaults above
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# the paranoid table is copied here, before the high bytes are added, so it
# only has entries for 0x00-0x7f
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
# bytes 0x80-0xff map to themselves in the non-paranoid table
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))

445

420

446

def jsonescape(s, paranoid=False):
    '''escape the byte string s so it can be embedded in a JSON string

    JSON has no way to carry bytes that are not Unicode text, so the
    input is normalized first:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    table = _paranoidjsonmap if paranoid else _jsonmap
    utf8b = toutf8b(s)
    try:
        # fast path: one table lookup per byte. The paranoid table only
        # covers ASCII, so a non-ASCII byte raises IndexError here and
        # sends us to the slow path below.
        return ''.join(table[b] for b in bytearray(utf8b))
    except IndexError:
        pass

    # slow path: go through UTF-16 code units so that a non-BMP char is
    # represented as a surrogate pair of \uXXXX escapes
    units = array.array('H', utf8b.decode('utf-8').encode('utf-16'))
    del units[0]  # drop BOM
    pieces = []
    for unit in units:
        if unit < 128:
            pieces.append(table[unit])
        else:
            pieces.append('\\u%04x' % unit)
    return ''.join(pieces)

501

476

502

# length of a UTF-8 sequence, indexed by the high nibble of its first byte.
# 0 stands for a single ASCII byte; continuation bytes (0x8-0xb) map to 1
# so that the validating decode in getutf8char rejects them.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    s is a byte string and pos an index into it. The character is returned
    as a slice of s (one to four bytes long).

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character, and IndexError if pos is out of range.
    '''

    first = s[pos]
    if isinstance(first, int):
        # indexing bytes on Python 3 yields an int rather than a length-1
        # string; keep the value and rebuild the one-byte string from it
        lead = first
        first = bytes(bytearray((lead,)))
    else:
        lead = ord(first)

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[lead >> 4]
    if not l: # ascii
        return first

    c = s[pos:pos + l]
    # validate with attempted decode; the decoded value itself is unused
    # NOTE(review): Python 3's strict utf-8 codec also rejects encoded
    # surrogates (U+D800-U+DFFF) - confirm whether callers rely on those
    # being accepted as on Python 2.
    c.decode("utf-8")
    return c

520

495

521

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # A localstr always carries its UTF-8 form. This has to be checked
    # before looking at the local bytes: they may well contain "\xed"
    # (e.g. in a single-byte local encoding), and escaping them byte by
    # byte would throw the cached UTF-8 away.
    if isinstance(s, localstr):
        return s._utf8

    # "\xed" is the lead byte of every encoded U+Dxxx character; without
    # it there is nothing to re-escape, so valid UTF-8 passes as-is
    if "\xed" not in s:
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # collect the pieces and join once instead of growing a string
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not the start of a valid character: escape this single byte
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        parts.append(c)
    return ''.join(parts)

577

552

578

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Characters in the U+DC00-U+DCFF range are turned back into the single
    bytes they stand for, which gives back the original binary string.
    This is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # collect the pieces and join once instead of growing a string
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            # the low byte of the code point is the original byte.
            # pycompat.bytechr (already used for the json tables) is used
            # instead of chr so the piece stays a byte string.
            # NOTE(review): Python 3's strict utf-8 codec rejects encoded
            # surrogates - confirm whether this decode needs an error
            # handler there.
            c = pycompat.bytechr(ord(c.decode("utf-8")) & 0xff)
        parts.append(c)
    return ''.join(parts)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 pycompat,
             )
             # short alias; used throughout this module to pass encoding names and
             # error modes to the codec APIs
             _sysstr = pycompat.sysstr
             # Python 3 has no unichr builtin; chr does the same job there
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean only scans strings
             # that contain "\xe2" or "\xef", so every entry must start with one of
             # them (indexing bytes yields ints on Python 3, hence the two forms)
             if pycompat.ispy3:
                 assert set(i[0] for i in _ignore) == set([ord(b'\xe2'), ord(b'\xef')])
             else:
                 assert set(i[0] for i in _ignore) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip the codepoints that HFS+ ignores out of s.

                 s is a UTF-8 encoded byte string. It is returned as-is when it
                 holds none of the ignorable sequences.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every entry of _ignore starts with one of these two bytes (this is
                 # asserted when the module is loaded), so other strings need no work
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for seq in _ignore:
                     s = s.replace(seq, '')
                 return s
             # encoding.environ is exported as a read-only view of the environment:
             # assigning to it is not a way to modify the process environment
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if pycompat.ispy3 and not _nativeenviron:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             elif pycompat.ispy3:
                 # Python 3 with a bytes view of the environment
                 environ = os.environb  # re-exports
             else:
                 # Python 2: os.environ can be used directly
                 environ = os.environ  # re-exports
-            def _getpreferredencoding():
-                '''
-                On darwin, getpreferredencoding ignores the locale environment and
-                always returns mac-roman. http://bugs.python.org/issue6202 fixes this
-                for Python 2.7 and up. This is the same corrected code for earlier
-                Python versions.
-                However, we can't use a version check for this method, as some distributions
-                patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
-                encoding, as it is unlikely that this encoding is the actually expected.
-                '''
-                try:
-                    locale.CODESET
-                except AttributeError:
-                    # Fall back to parsing environment variables :-(
-                    return locale.getdefaultlocale()[1]
-                oldloc = locale.setlocale(locale.LC_CTYPE)
-                locale.setlocale(locale.LC_CTYPE, "")
-                result = locale.nl_langinfo(locale.CODESET)
-                locale.setlocale(locale.LC_CTYPE, oldloc)
-                return result
             # maps an encoding name reported by the locale machinery to a callable
             # returning the name that should be used in its place
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
-                'mac-roman': _getpreferredencoding
             }
             # Pick the local encoding: HGENCODING wins when it is set and non-empty,
             # otherwise the locale's preferred encoding is used after being passed
             # through _encodingfixers. A broken locale falls back to 'ascii'.
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error handling mode used when decoding local strings (see fromlocal)
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # encoding tried by tolocal for stored strings that are not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 original

                 The string value is the local representation l; the UTF-8 form u
                 is kept in _utf8 so that an unmodified string can be round-tripped
                 to the local encoding and back.'''
                 def __new__(cls, u, l):
                     obj = super(localstr, cls).__new__(cls, l)
                     obj._utf8 = u
                     return obj
                 def __hash__(self):
                     # hash the UTF-8 form: distinct originals may share one local
                     # representation, and this avoids collisions between them
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Raises error.Abort if the configured encoding is unknown to Python.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original next to the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # lossy: cache the UTF-8 form of the fallback decoding
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     # raised by the codec machinery for an unknown encoding name
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr gives back the UTF-8 form it was created from. Any other
                 string is decoded with the error mode set by HGENCODINGMODE, which
                 defaults to 'strict'. In this mode, unknown characters will cause
                 an error message. Other modes include 'replace', which replaces
                 unknown characters with a special Unicode character, and 'ignore',
                 which drops the character.

                 Raises error.Abort when s cannot be decoded or when the configured
                 encoding is unknown.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     codec, mode = _sysstr(encoding), _sysstr(encodingmode)
                     return s.decode(codec, mode).encode("utf-8")
                 except UnicodeDecodeError as inst:
                     # quote up to ten bytes on either side of the failing position
                     begin = max(0, inst.start - 10)
                     sub = s[begin:inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def unitolocal(u):
                 """Turn unicode string u into a byte string in the local encoding"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 """Turn byte string s in the local encoding into a unicode string"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             if pycompat.ispy3:
                 # native str is unicode here, so go through the unicode helpers
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
             else:
                 # presumably a no-op passthrough on Python 2 - see pycompat.identity
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # East Asian width classes that ucolwidth counts as two columns.
             # Ambiguous-width characters (class "A") are only included when
             # HGENCODINGAMBIGUOUS is set to 'wide'; the default is 'narrow'.
             if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Count the display columns taken by local-encoding string s"
                 # undecodable bytes are replaced rather than raising an error
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Count the display columns taken by Unicode string d"
                 widthclass = getattr(unicodedata, 'east_asian_width', None)
                 if widthclass is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for ch in d:
                     # characters in one of the classes listed in wide take two
                     columns += 2 if widthclass(ch) in wide else 1
                 return columns
             def getcols(s, start, c):
                 '''Find the substring of s that starts at byte index start and is
                 exactly c columns wide according to colwidth

                 Returns None when no candidate has that width.

                 NOTE(review): candidates end before len(s), so a substring that
                 runs to the very end of s is never tried - confirm this is
                 intended.'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
                     end += 1
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 's' is a byte string in the local encoding. If it cannot be decoded,
                 it is trimmed byte-wise, counting one column per byte.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # not valid in the local encoding: trim byte-wise instead
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # from here on, width is the room left for text next to the ellipsis
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side and concat puts
                 # the ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the rest fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis # not enough room for multi-column characters
             def _asciilower(s):
                 '''lowercase byte string s, which must be pure ASCII

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decode only to validate; the decoded value is thrown away
                 s.decode('ascii')
                 lowered = s.lower()
                 return lowered
             def asciilower(s):
                 '''lowercase ASCII string s with the best implementation available

                 The first call picks parsers.asciilower when the parsers module has
                 one (_asciilower otherwise) and rebinds the module-level name
                 asciilower to it, so later lookups of that name skip this wrapper.
                 '''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 asciilower = getattr(parsers, 'asciilower', _asciilower)
                 return asciilower(s)
             def _asciiupper(s):
                 '''uppercase byte string s, which must be pure ASCII

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decode only to validate; the decoded value is thrown away
                 s.decode('ascii')
                 uppered = s.upper()
                 return uppered
             def asciiupper(s):
                 '''uppercase ASCII string s with the best implementation available

                 The first call picks parsers.asciiupper when the parsers module has
                 one (_asciiupper otherwise) and rebinds the module-level name
                 asciiupper to it, so later lookups of that name skip this wrapper.
                 '''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
                 return asciiupper(s)
             def lower(s):
                 "best-effort encoding-aware lowercasing of local string s"
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: fold through unicode below
                     pass
                 try:
                     if isinstance(s, localstr):
                         # the cached UTF-8 form is the reliable one
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     folded = decoded.lower()
                     if folded != decoded:
                         return folded.encode(_sysstr(encoding))
                     return s # nothing changed: preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware uppercasing of local string s"
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: take the slower unicode-based route
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 "uppercase local string s through unicode; used by upper for non-ASCII"
                 try:
                     if isinstance(s, localstr):
                         # the cached UTF-8 form is the reliable one
                         decoded = s._utf8.decode("utf-8")
                     else:
                         decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     folded = decoded.upper()
                     if folded != decoded:
                         return folded.encode(_sysstr(encoding))
                     return s # nothing changed: preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             class normcasespecs(object):
                 '''constants describing what a platform's normcase does to ASCII strings

                 The value is chosen per platform and should be consistent with what
                 normcase on that platform actually does.

                 lower: normcase lowercases ASCII strings
                 other: the fallback function should always be called
                 upper: normcase uppercases ASCII strings

                 This should be kept in sync with normcase_spec in util.h.'''
                 lower = -1
                 other = 0
                 upper = 1
             # JSON escape table for the ASCII range, indexed by byte value: control
             # characters and DEL are written as \uXXXX, the rest map to themselves
             _jsonmap = list("\\u%04x" % x for x in range(32))
             _jsonmap += (pycompat.bytechr(x) for x in range(32, 127))
             _jsonmap += ['\\u007f']
             # characters that have a dedicated two-character escape
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             # the paranoid table is copied while it is still 128 entries long, so
             # indexing it with a non-ASCII byte raises IndexError
             _paranoidjsonmap = list(_jsonmap)
             _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # only the regular table passes bytes >= 0x80 through unchanged
             _jsonmap += (pycompat.bytechr(x) for x in range(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 jm = _paranoidjsonmap if paranoid else _jsonmap
                 u8chars = toutf8b(s)
                 try:
                     # fast path: one table lookup per byte
                     return ''.join(map(jm.__getitem__, bytearray(u8chars)))
                 except IndexError:
                     # a byte has no entry in the selected table; escape per
                     # code unit below instead
                     pass
                 # non-BMP char is represented as UTF-16 surrogate pair
                 utf16 = u8chars.decode('utf-8').encode('utf-16')
                 u16codes = array.array('H', utf16)
                 u16codes.pop(0)  # drop BOM
                 pieces = []
                 for code in u16codes:
                     if code < 128:
                         pieces.append(jm[code])
                     else:
                         pieces.append('\\u%04x' % code)
                 return ''.join(pieces)
             # Number of bytes to try decoding, indexed by the high nibble of the
             # first byte: 0 marks ASCII (0x0-0x7, returned without decoding); the
             # nibbles 0x8-0xb get length 1, so that lone byte is handed to the
             # decoder for validation; 0xc-0xd -> 2, 0xe -> 3, 0xf -> 4.
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''get the next full utf-8 character in the given string, starting at pos

                 s is a byte string and pos an index into it. The returned value is
                 the slice of s holding the character that starts at pos.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character, and IndexError if pos is past the end of s.
                 '''
                 # Take a one-byte slice instead of indexing: indexing a Python 3
                 # bytes object yields an int, which ord() rejects, while a slice is
                 # a length-1 byte string on both Python 2 and Python 3.
                 lead = s[pos:pos + 1]
                 if not lead:
                     # slicing never raises, so report an out-of-range position the
                     # way plain indexing would
                     raise IndexError('string index out of range')
                 # find how many bytes to attempt decoding from first nibble
                 l = _utf8len[ord(lead) >> 4]
                 if not l: # ascii
                     return lead
                 c = s[pos:pos + l]
                 # validate with attempted decode
                 c.decode("utf-8")
                 return c
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # Always prefer the cached UTF-8 form. This check must not be
                     # gated on the local bytes lacking "\xed": a localstr whose
                     # local encoding happens to use that byte would otherwise be
                     # escaped byte-by-byte instead of being sent as UTF-8.
                     return s._utf8
                 if "\xed" not in s:
                     # no U+Dxxx lead byte present: valid UTF-8 can pass as-is
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters; only
                             # the first byte is consumed here, the following bytes
                             # are picked up by later iterations
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not valid UTF-8 at this position: escape a single byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     chunks.append(c)
                 return "".join(chunks)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 This undoes the escaping done by toutf8b and returns the original
                 binary string. It is a round-trip process for strings like
                 filenames, but metadata that was passed through tolocal will remain
                 in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 pieces = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     char = getutf8char(s, pos)
                     pos += len(char)
                     if "\xed\xb0\x80" <= char <= "\xed\xb3\xbf":
                         # unescape U+DCxx characters: the low 8 bits of the code
                         # point are the original byte
                         char = chr(ord(char.decode("utf-8")) & 0xff)
                     pieces.append(char)
                 return "".join(pieces)