encoding: make sure "wide" variable never be referenced from other modules...
Yuya Nishihara
r32537:044f3d7e default
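The commit renames encoding.py's module-level "wide" flag to "_wide", marking it module-private: other code should derive display widths through colwidth()/ucolwidth() rather than reading the flag. A minimal sketch of the intended call pattern (illustration only, not part of the commit; the padded() helper is hypothetical, Python 2 byte-string semantics assumed):

    from mercurial import encoding

    def padded(s, width):
        # Pad byte string s to `width` display columns. colwidth()
        # consults the now-private _wide flag internally, so callers
        # need not know how ambiguous-width characters are counted.
        return s + ' ' * max(0, width - encoding.colwidth(s))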
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
@@ -1,593 +1,593 @@
# encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import array
import locale
import os
import unicodedata

from . import (
    error,
    policy,
    pycompat,
)

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)

def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    if "\xe2" in s or "\xef" in s:
        for c in _ignore:
            s = s.replace(c, '')
    return s

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items()) # re-exports

_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'

class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''
    def __new__(cls, u, l):
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        return hash(self._utf8) # avoid collisions in local string space

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))

def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')

# converter functions between native str and byte string. use these if the
# character encoding is not known (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
else:
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items()) # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
-wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
-               and "WFA" or "WF")
+_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
+                and "WFA" or "WF")

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
-        return sum([eaw(c) in wide and 2 or 1 for c in d])
+        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t = '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # not enough room for multi-column characters

def _asciilower(s):
    '''convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    s.decode('ascii')
    return s.lower()

def asciilower(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    parsers = policy.importmod(r'parsers')
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)

def _asciiupper(s):
    '''convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    s.decode('ascii')
    return s.upper()

def asciiupper(s):
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    parsers = policy.importmod(r'parsers')
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

def upperfallback(s):
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0

_jsonmap = []
_jsonmap.extend("\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0) # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos]) >> 4]
    if not l: # ascii
        return s[pos]

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can be
      round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if "\xed" not in s:
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        r += c
    return r

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Surrogate-escaped bytes are converted back, returning the original
    binary string. This is a round-trip process for strings like
    filenames, but metadata that was passed through tolocal will remain
    in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip("\\xc2\\xc2\\x80")
    True
    >>> roundtrip("\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    r = ""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = chr(ord(c.decode("utf-8")) & 0xff)
        r += c
    return r
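Editor's note (illustration, not part of the commit): the renamed "_wide" flag is what HGENCODINGAMBIGUOUS ultimately controls. An East-Asian-ambiguous character such as U+00A6 BROKEN BAR counts as two columns only when the flag contains "WFA" (Python 2 assumed):

    import unicodedata
    # U+00A6 BROKEN BAR is in East_Asian_Width class 'A' (ambiguous)
    assert unicodedata.east_asian_width(u'\xa6') == 'A'

    from mercurial import encoding
    # ucolwidth() counts a character as wide when its width class is in
    # _wide: "WFA" under HGENCODINGAMBIGUOUS=wide, "WF" otherwise
    print encoding.ucolwidth(u'\xa6') # 1 by default, 2 with HGENCODINGAMBIGUOUS=wide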
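Editor's note (illustration, not part of the commit): the UTF-8b mapping that toutf8b() describes, shown end to end; the values mirror the fromutf8b() doctests above (Python 2 byte-string semantics assumed):

    from mercurial import encoding

    raw = '\xc3\xa9\x99abcd' # valid UTF-8 followed by a stray \x99 byte
    u8b = encoding.toutf8b(raw)
    assert u8b == '\xc3\xa9\xed\xb2\x99abcd' # \x99 mapped to U+DC99, UTF-8-encoded
    assert encoding.fromutf8b(u8b) == raw # lossless round-trip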
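Editor's note (illustration, not part of the commit): how HGENCODINGMODE drives fromlocal(). Both variables are read when the module is first imported, so they must be set beforehand (Python 2 assumed):

    import os
    os.environ['HGENCODING'] = 'ascii'
    os.environ['HGENCODINGMODE'] = 'replace' # or 'strict' (default), or 'ignore'
    from mercurial import encoding

    # 'replace' maps the undecodable byte to U+FFFD and re-encodes as UTF-8;
    # under 'strict' the same call would raise error.Abort instead
    print repr(encoding.fromlocal('caf\xe9')) # 'caf\xef\xbf\xbd'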