##// END OF EJS Templates
encoding: alias cp65001 to utf-8 on Windows...
Yuya Nishihara -
r38633:44302901 stable
parent child Browse files
Show More
@@ -1,582 +1,587 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from . import (
14 from . import (
15 error,
15 error,
16 policy,
16 policy,
17 pycompat,
17 pycompat,
18 )
18 )
19
19
20 from .pure import (
20 from .pure import (
21 charencode as charencodepure,
21 charencode as charencodepure,
22 )
22 )
23
23
# Character-encoding primitives: policy.importmod() transparently selects
# the C implementation when available, else the pure-Python fallback.
charencode = policy.importmod(r'charencode')

# Re-export the fast-path helpers under their public module-level names.
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
isasciistr = charencode.isasciistr
_jsonescapeu8fast = charencode.jsonescapeu8fast
30
30
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() covers the full codepoint range there.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignorecodepoints = ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
                     "206a 206b 206c 206d 206e 206f feff".split())
_ignore = [unichr(int(cp, 16)).encode("utf-8") for cp in _ignorecodepoints]
# hfsignoreclean() relies on every entry starting with one of these lead bytes
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44
44
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable sequence starts with one of these two lead bytes, so
    # a cheap containment check skips the replace loop in the common case.
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s
57
57
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # The preferred encoding isn't known this early, so encode with utf-8 to
    # avoid unicode errors; the dict is rebuilt once encoding is settled.
    environ = {k.encode(u'utf-8'): v.encode(u'utf-8')
               for k, v in os.environ.items()}  # re-exports
70
70
# Map locale names reported by Python to names the codec machinery accepts.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingfixers['cp65001'] = lambda: 'utf-8'

try:
    # $HGENCODING overrides the locale; otherwise ask the locale module.
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    fixer = _encodingfixers.get(encoding)
    if fixer:
        encoding = fixer()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
85
90
class localstr(bytes):
    '''A local-encoding byte string that remembers its exact UTF-8 form.

    Strings that round-trip unmodified between the local encoding and
    UTF-8 carry the original UTF-8 bytes along, so the conversion back
    is lossless.
    '''
    def __new__(cls, u, l):
        # the byte content is the local-encoding form; the UTF-8 original
        # rides along as an attribute
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
95
100
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII is valid in every encoding: nothing to convert
    if isasciistr(s):
        return s

    try:
        try:
            # make sure the string is actually stored in UTF-8
            uni = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            local = uni.encode(_sysstr(encoding), u"replace")
            if uni == local.decode(_sysstr(encoding)):
                # local is a safe, non-lossy encoding of s
                return local
            # lossy: cache the UTF-8 original on the result
            return localstr(s, local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                uni = s.decode(_sysstr(fallbackencoding))
                local = uni.encode(_sysstr(encoding), u"replace")
                if uni == local.decode(_sysstr(encoding)):
                    # local is a safe, non-lossy encoding of s
                    return local
                return localstr(uni.encode('UTF-8'), local)
            except UnicodeDecodeError:
                uni = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return uni.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
160
165
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr caches its UTF-8 form: lossless round-trip available
    if isinstance(s, localstr):
        return s._utf8
    # pure ASCII is already valid UTF-8
    if isasciistr(s):
        return s

    try:
        uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return uni.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte
        context = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!"
                          % (context, pycompat.bytestr(inst)))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
187
192
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
191
196
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
195
200
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""
    def unifunc(obj):
        # render via the byte-string method, then decode to unicode
        return unifromlocal(bytesfunc(obj))
    return unifunc
202
207
# Converters between native str and byte strings. Use these when the data's
# character encoding is not known (e.g. exception messages) or is locale
# dependent (e.g. date formatting).
if pycompat.ispy3:
    # native str is unicode: go through the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # native str is already a byte string: nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
214
219
if not _nativeenviron:
    # now that encoding and the helper functions exist, rebuild the environ
    # dict exported to other modules using real local-encoding conversion
    environ = {tolocal(k.encode(u'utf-8')): tolocal(v.encode(u'utf-8'))
               for k, v in os.environ.items()}  # re-exports
220
225
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# "WFA" counts Wide, Fullwidth and Ambiguous east-asian classes as two
# columns; "WF" counts only Wide and Fullwidth.
if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    _wide = _sysstr("WFA")
else:
    _wide = _sysstr("WF")
224
229
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
228
233
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: assume one column per char
        return len(d)
    # characters in the configured wide classes occupy two columns
    return sum(2 if eaw(c) in _wide else 1 for c in d)
235
240
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the candidate byte-slice until it renders at exactly c columns
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
243
248
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        uni = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming byte-wise, where every
        # byte counts as one column
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(uni) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # drop characters from the trimmed side until the remainder fits
    for i in xrange(1, len(uni)):
        if leftside:
            cand = uni[i:]
        else:
            cand = uni[:-i]
        if ucolwidth(cand) <= width:
            encoded = cand.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # no enough room for multi-column characters
339
344
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # ASCII-only fast path implemented in C
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.lower()
        if uni == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
360
365
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # ASCII-only fast path implemented in C
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the encoding-aware slow path
        return upperfallback(s)
367
372
def upperfallback(s):
    "encoding-aware uppercasing for strings asciiupper() cannot handle"
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = uni.upper()
        if uni == folded:
            return s # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
383
388
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # enum-like constants; the numeric values mirror util.h
    lower = -1
    upper = 1
    other = 0
398
403
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the C implementation raises ValueError on input it
        # cannot handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow path: pure-Python fallback handles everything
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450
455
# Invalid UTF-8 bytes are mapped into the U+DCxx surrogate range, so those
# codes must be decoded/encoded transparently (Python 3 only; Python 2's
# codecs pass surrogates through under 'strict').
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# Length in bytes of a UTF-8 sequence, indexed by the high nibble of its
# lead byte; 0 marks a plain ASCII byte.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
459
464
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte determines the sequence length
    length = _utf8len[ord(s[pos:pos + 1]) >> 4]
    if not length: # plain ascii byte
        return s[pos:pos + 1]

    char = s[pos:pos + length]
    # validate by attempting to decode the candidate sequence
    char.decode("utf-8", _utf8strict)
    return char
476
481
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # pure-ASCII input (that isn't a localstr carrying a cached UTF-8
    # form) needs no escaping at all
    if not isinstance(s, localstr) and isasciistr(s):
        return s
    if "\xed" not in s:
        # no surrogate lead byte present, so nothing to re-escape;
        # the cached or directly-valid UTF-8 fast paths apply
        if isinstance(s, localstr):
            return s._utf8
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the bytes, copying valid characters and escaping
    # everything else into the U+DCxx surrogate range
    s = pycompat.bytestr(s)
    out = ""
    idx = 0
    end = len(s)
    while idx < end:
        try:
            ch = getutf8char(s, idx)
            if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = unichr(0xdc00 + ord(s[idx])).encode('utf-8', _utf8strict)
                idx += 1
            else:
                idx += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the surrogate escape range
            ch = unichr(0xdc00 + ord(s[idx])).encode('utf-8', _utf8strict)
            idx += 1
        out += ch
    return out
536
541
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = ""
    idx = 0
    end = len(s)
    while idx < end:
        ch = getutf8char(s, idx)
        idx += len(ch)
        # unescape U+DCxx characters: recover the original byte from
        # the low 8 bits of the surrogate code point
        if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xff)
        out += ch
    return out
General Comments 0
You need to be logged in to leave comments. Login now