upstream/mercurial-mirror Commit - r33810:dabe1f11

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

# Pick the C or pure-Python 'charencode' implementation according to the
# active module policy.
charencode = policy.importmod(r'charencode')

# re-export the ASCII-only case converters under short names
asciilower, asciiupper = charencode.asciilower, charencode.asciiupper

# converts a name to the interpreter's native str type
_sysstr = pycompat.sysstr

# Python 3 has no unichr(); chr() already yields unicode characters there
if pycompat.ispy3:
    unichr = chr

30

31

# UTF-8 encoded forms of the code points that HFS+ silently drops from
# file names (Apple Technote 1150, "Unicode Subtleties"). We have to drop
# them too in a few places to stay sane.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# hfsignoreclean() pre-filters on these two lead bytes, so every entry
# must start with one of them
assert all(seq.startswith(("\xe2", "\xef")) for seq in _ignore)

39

40

def hfsignoreclean(s):
    """Return s with every HFS+-ignored code point stripped out.

    s is a UTF-8 encoded byte string. Strings that contain neither of the
    two possible lead bytes are returned untouched without scanning the
    ignore table.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: all ignored sequences start with \xe2 or \xef
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s

52

53

# encoding.environ is a read-only, bytes-keyed view of the environment; it
# must not be used to modify the process environment.
#
# _nativeenviron is true when the platform can hand us a bytes environment
# directly (always on Python 2, and on Python 3 where os.environb exists).
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if pycompat.ispy3:
    if _nativeenviron:
        environ = os.environb # re-exports
    else:
        # the preferred encoding isn't known yet; encode with utf-8 to
        # avoid unicode errors, and rebuild the dict once the encoding is
        # settled
        environ = {k.encode(u'utf-8'): v.encode(u'utf-8')
                   for k, v in os.environ.items()} # re-exports
else:
    environ = os.environ # re-exports

65

66

# Maps odd names reported for the preferred encoding to a callable that
# returns the name we should use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

# Determine the local encoding: HGENCODING wins; otherwise ask the locale
# and normalize the answer through _encodingfixers. Any locale failure
# falls back to plain ASCII.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error handler used when decoding local strings (see fromlocal())
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() for stored strings that are not valid UTF-8
fallbackencoding = 'ISO-8859-1'

80

81

class localstr(bytes):
    '''A local-encoding byte string that remembers its UTF-8 original.

    Instances compare and behave like the local byte string 'l', but keep
    the UTF-8 byte string 'u' they were derived from in '_utf8', so that
    strings which are passed around unmodified can be round-tripped to the
    local encoding and back without loss.

    The base class is bytes rather than str: on Python 2 the two are the
    same type, while on Python 3 str is a unicode type and would not hold
    the local byte string.
    '''
    def __new__(cls, u, l):
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        # Hash on the UTF-8 form: two different originals may collapse to
        # the same lossy local string, and must not collide as dict keys.
        return hash(self._utf8)

90

91

def tolocal(s):
    """
    Convert a string from internal UTF-8 to the local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets. The
    input is decoded strictly as UTF-8 first, then as the fallback
    encoding (Latin-1), and as a last resort as UTF-8 with unknown
    characters replaced.

    When the local encoding cannot represent the string exactly, a
    localstr is returned: it carries the known UTF-8 form next to the
    lossy local representation, so fromlocal() can restore the original.

    An unknown encoding name is reported as error.Abort.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # check that the string really is stored as UTF-8
            decoded = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: already in the local encoding
                return s
            codec = _sysstr(encoding)
            localbytes = decoded.encode(codec, u"replace")
            if decoded == localbytes.decode(codec):
                # the local form is a safe, non-lossy encoding of s
                return localbytes
            return localstr(s, localbytes)
        except UnicodeDecodeError:
            # only expected when looking at an ancient changeset
            try:
                decoded = s.decode(_sysstr(fallbackencoding))
                codec = _sysstr(encoding)
                localbytes = decoded.encode(codec, u"replace")
                if decoded == localbytes.decode(codec):
                    # the local form is a safe, non-lossy encoding of s
                    return localbytes
                return localstr(decoded.encode('UTF-8'), localbytes)
            except UnicodeDecodeError:
                # last ditch: this result cannot be round-tripped
                decoded = s.decode("utf-8", "replace")
                return decoded.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

152

153

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode given by HGENCODINGMODE, 'strict' by
    default, in which undecodable input aborts with an error message.
    The other modes are 'replace', which substitutes a special Unicode
    character for each unknown character, and 'ignore', which drops it.

    A localstr is not decoded at all; its cached UTF-8 form is returned.
    """

    # lossless round-trip available?
    if isinstance(s, localstr):
        return s._utf8

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes on either side of the offending position
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[lo:hi], inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

176

177

def unitolocal(u):
    """Encode unicode string u as a byte string in the local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

180

181

def unifromlocal(s):
    """Decode local-encoding byte string s into a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

184

185

def unimethod(bytesfunc):
    """Wrap bytesfunc so that it can serve as __unicode__() or, on
    Python 3, __str__(): the wrapper calls bytesfunc (e.g. __bytes__())
    and converts its local-encoding result to unicode"""
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc

191

192

# Converters between the native str type and local byte strings. Use them
# where the character encoding is unknown (e.g. exception messages) or is
# known to depend on the locale (e.g. date formatting).
if not pycompat.ispy3:
    # native str is already a byte string: nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
else:
    # native str is unicode: go through the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod

203

204

if not _nativeenviron:
    # The encoding and the helper functions exist now, so rebuild the
    # environ dict that other modules import, this time in the local
    # encoding.
    environ = {tolocal(k.encode(u'utf-8')): tolocal(v.encode(u'utf-8'))
               for k, v in os.environ.items()} # re-exports

209

210

# East Asian Width classes that occupy two columns. Ambiguous-width
# characters ('A') count as wide only when HGENCODINGAMBIGUOUS is 'wide'.
_wide = _sysstr(
    "WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
    else "WF")

213

214

def colwidth(s):
    """Return the number of display columns taken by local string s

    Bytes that cannot be decoded in the local encoding are replaced
    rather than reported.
    """
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)

217

218

def ucolwidth(d):
    """Return the number of display columns taken by unicode string d

    Characters whose East Asian Width class is listed in _wide count as
    two columns, all others as one. If unicodedata lacks
    east_asian_width(), every character counts as one column.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)

224

225

def getcols(s, start, c):
    '''Return the substring of s that begins at byte index start and is
    exactly c columns wide according to colwidth()

    Candidate end offsets run from start + c up to, but not including,
    len(s). None is returned when no candidate has the requested width.
    '''
    for end in xrange(start + c, len(s)):
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
    return None

232

233

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Strings that cannot be decoded in the local encoding are trimmed by
    byte count instead of column count.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for text once the ellipsis has been placed
    room = width - len(ellipsis)

    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to plain byte counting
        if len(s) <= width:
            # already fits
            return s
        if room <= 0:
            # not even the ellipsis fits; cut it down to 'width'
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        # already fits
        return s

    if room <= 0:
        # not even the ellipsis fits; cut it down to 'width'
        return ellipsis[:width]

    # drop one more character from the trimmed side on each round until
    # the remainder fits into the available columns
    for drop in xrange(1, len(u)):
        if leftside:
            usub = u[drop:]
        else:
            usub = u[:-drop]
        if ucolwidth(usub) <= room:
            kept = usub.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    # no room left for even a single multi-column character
    return ellipsis

325

326

def lower(s):
    """best-effort encoding-aware lowercasing of local string s

    Pure ASCII input takes the asciilower() fast path. Anything else is
    decoded (from the cached UTF-8 form for a localstr, otherwise from
    the local encoding), lowercased as unicode and encoded back. If that
    fails with a unicode error, only a plain byte-string lower() is done.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII; take the slow path below
        pass
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back the very same object so that a
        # localstr keeps its cached UTF-8 form
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

346

347

def upper(s):
    """best-effort encoding-aware uppercasing of local string s

    Tries the ASCII-only asciiupper() first and hands non-ASCII input
    over to upperfallback().
    """
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result

353

354

def upperfallback(s):
    """uppercase local string s by going through unicode

    A localstr is decoded from its cached UTF-8 form, anything else from
    the local encoding. If decoding or re-encoding fails with a unicode
    error, only a plain byte-string upper() is done.
    """
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back the very same object so that a
        # localstr keeps its cached UTF-8 form
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

369

370

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    Each platform declares one of these values, and the value has to
    agree with what normcase really does on that platform.

    lower: ASCII strings are lowercased by normcase
    upper: ASCII strings are uppercased by normcase
    other: neither; the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    lower = -1
    upper = 1

384

385

# Byte value -> JSON text lookup tables used by jsonescape().
#
# Start with the generic form for 0x00-0x7f: control characters and DEL
# become \uXXXX escapes, printable ASCII stands for itself.
_jsonmap = list("\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# characters that have a dedicated short escape in JSON
_jsonmap[0x08] = '\\b'
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
# The paranoid table is copied while it still has only 128 entries, so
# looking up any byte >= 0x80 in it raises IndexError. It also escapes
# angle brackets.
_paranoidjsonmap = list(_jsonmap)
_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e' # '>'
# the regular table passes bytes 0x80-0xff through unchanged
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))

400

401

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON cannot carry bytes that are not Unicode text, which is a problem
    for us. We work around it like this:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    jm = _paranoidjsonmap if paranoid else _jsonmap

    u8chars = toutf8b(s)
    try:
        # fast path: every byte has an entry in the selected table
        return ''.join(jm[b] for b in bytearray(u8chars))
    except IndexError:
        # a byte had no table entry; escape per UTF-16 code unit below
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    units = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    del units[0] # drop BOM
    return ''.join(jm[cu] if cu < 128 else '\\u%04x' % cu for cu in units)

456

457

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

457

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

458

459

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    s is a byte string and pos a non-negative byte offset into it. The
    returned value is always a byte string: one byte for ASCII, otherwise
    the bytes of the character that starts at pos.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character, and IndexError if pos is past the end of s.
    '''

    # Slice instead of indexing: on Python 3, indexing bytes yields an
    # int, which ord() rejects, whereas a one-byte slice is a byte string
    # on both Python 2 and 3.
    first = s[pos:pos + 1]
    if not first:
        # a slice never fails, so report an out-of-range pos explicitly
        raise IndexError('string index out of range')

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(first) >> 4]
    if not l: # ascii
        return first

    c = s[pos:pos + l]
    # validate with attempted decode
    c.decode("utf-8")
    return c

475

476

def toutf8b(s):

476

def toutf8b(s):

477

'''convert a local, possibly-binary string into UTF-8b

477

'''convert a local, possibly-binary string into UTF-8b

478

479

This is intended as a generic method to preserve data when working

479

This is intended as a generic method to preserve data when working

480

with schemes like JSON and XML that have no provision for

480

with schemes like JSON and XML that have no provision for

481

arbitrary byte strings. As Mercurial often doesn't know

481

arbitrary byte strings. As Mercurial often doesn't know

482

what encoding data is in, we use so-called UTF-8b.

482

what encoding data is in, we use so-called UTF-8b.

483

484

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

484

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

485

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

485

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

486

uDC00-uDCFF.

486

uDC00-uDCFF.

487

488

Principles of operation:

488

Principles of operation:

489

490

- ASCII and UTF-8 data successfully round-trips and is understood

490

- ASCII and UTF-8 data successfully round-trips and is understood

491

by Unicode-oriented clients

491

by Unicode-oriented clients

492

- filenames and file contents in arbitrary other encodings can have

492

- filenames and file contents in arbitrary other encodings can have

493

be round-tripped or recovered by clueful clients

493

be round-tripped or recovered by clueful clients

494

- local strings that have a cached known UTF-8 encoding (aka

494

- local strings that have a cached known UTF-8 encoding (aka

495

localstr) get sent as UTF-8 so Unicode-oriented clients get the

495

localstr) get sent as UTF-8 so Unicode-oriented clients get the

496

Unicode data they want

496

Unicode data they want

497

- because we must preserve UTF-8 bytestring in places such as

497

- because we must preserve UTF-8 bytestring in places such as

498

filenames, metadata can't be roundtripped without help

498

filenames, metadata can't be roundtripped without help

499

500

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

500

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

501

arbitrary bytes into an internal Unicode format that can be

501

arbitrary bytes into an internal Unicode format that can be

502

re-encoded back into the original. Here we are exposing the

502

re-encoded back into the original. Here we are exposing the

503

internal surrogate encoding as a UTF-8 string.)

503

internal surrogate encoding as a UTF-8 string.)

504

'''

504

'''

505

506

if "\xed" not in s:

506

if "\xed" not in s:

507

if isinstance(s, localstr):

507

if isinstance(s, localstr):

508

return s._utf8

508

return s._utf8

509

try:

509

try:

510

s.decode('utf-8')

510

s.decode('utf-8')

511

return s

511

return s

512

except UnicodeDecodeError:

512

except UnicodeDecodeError:

513

pass

513

pass

514

515

r = ""

515

r = ""

516

pos = 0

516

pos = 0

517

l = len(s)

517

l = len(s)

518

while pos < l:

518

while pos < l:

519

try:

519

try:

520

c = getutf8char(s, pos)

520

c = getutf8char(s, pos)

521

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

521

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

522

# have to re-escape existing U+DCxx characters

522

# have to re-escape existing U+DCxx characters

523

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

523

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

524

pos += 1

524

pos += 1

525

else:

525

else:

526

pos += len(c)

526

pos += len(c)

527

except UnicodeDecodeError:

527

except UnicodeDecodeError:

528

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

528

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

529

pos += 1

529

pos += 1

530

r += c

530

r += c

531

return r

531

return r

532

533

def fromutf8b(s):

533

def fromutf8b(s):

534

'''Given a UTF-8b string, return a local, possibly-binary string.

534

'''Given a UTF-8b string, return a local, possibly-binary string.

535

536

return the original binary string. This

536

return the original binary string. This

537

is a round-trip process for strings like filenames, but metadata

537

is a round-trip process for strings like filenames, but metadata

538

that's was passed through tolocal will remain in UTF-8.

538

that's was passed through tolocal will remain in UTF-8.

539

540

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

540

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

541

>>> m = "\\xc3\\xa9\\x99abcd"

541

>>> m = "\\xc3\\xa9\\x99abcd"

542

>>> toutf8b(m)

542

>>> toutf8b(m)

543

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

543

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

544

>>> roundtrip(m)

544

>>> roundtrip(m)

545

True

545

True

546

>>> roundtrip("\\xc2\\xc2\\x80")

546

>>> roundtrip("\\xc2\\xc2\\x80")

547

True

547

True

548

>>> roundtrip("\\xef\\xbf\\xbd")

548

>>> roundtrip("\\xef\\xbf\\xbd")

549

True

549

True

550

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

550

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

551

True

551

True

552

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

552

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

553

True

553

True

554

'''

554

'''

555

556

# fast path - look for uDxxx prefixes in s

556

# fast path - look for uDxxx prefixes in s

557

if "\xed" not in s:

557

if "\xed" not in s:

558

return s

558

return s

559

560

# We could do this with the unicode type but some Python builds

560

# We could do this with the unicode type but some Python builds

561

# use UTF-16 internally (issue5031) which causes non-BMP code

561

# use UTF-16 internally (issue5031) which causes non-BMP code

562

# points to be escaped. Instead, we use our handy getutf8char

562

# points to be escaped. Instead, we use our handy getutf8char

563

# helper again to walk the string without "decoding" it.

563

# helper again to walk the string without "decoding" it.

564

565

r = ""

565

r = ""

566

pos = 0

566

pos = 0

567

l = len(s)

567

l = len(s)

568

while pos < l:

568

while pos < l:

569

c = getutf8char(s, pos)

569

c = getutf8char(s, pos)

570

pos += len(c)

570

pos += len(c)

571

# unescape U+DCxx characters

571

# unescape U+DCxx characters

572

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

572

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

573

c = chr(ord(c.decode("utf-8")) & 0xff)

573

c = chr(ord(c.decode("utf-8")) & 0xff)

574

r += c

574

r += c

575

return r

575

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             # Load the 'charencode' module through the policy importer and re-export
             # its ASCII case-folding helpers under this module's namespace.
             charencode = policy.importmod(r'charencode')
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             # short alias used wherever a codec or error-handler name is passed on
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 # keep the unichr() spelling usable below when running on Python 3
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity. Each entry is the UTF-8 encoding of one such code point.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only scans strings
             # containing "\xe2" or "\xef", so every ignored sequence must begin with
             # one of those two bytes
             assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 """Strip the code points that HFS+ ignores out of s.

                 s is a UTF-8 byte string; the cleaned string is returned.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every ignored sequence starts with one of these two bytes, so a
                 # string containing neither has nothing to remove
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 cleaned = s
                 for seq in _ignore:
                     cleaned = cleaned.replace(seq, '')
                 return cleaned
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # True when a bytes view of the environment is directly available: always
             # on Python 2, and on Python 3 only if os.supports_bytes_environ is set.
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             # Maps encoding names reported by the locale module to the name that
             # should be used instead; both entries resolve to plain 'ascii'.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
             }
             # Pick the local encoding: HGENCODING wins when set and non-empty,
             # otherwise the locale's preferred encoding (passed through the fixers
             # above), and 'ascii' when the locale lookup raises locale.Error.
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handling mode passed to decode() in fromlocal(), lower() and
             # upperfallback()
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # tried by tolocal() when its input is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
-            class localstr(str):
+            class localstr(bytes):
                 '''This class allows strings that are unmodified to be
                 round-tripped to the local encoding and back'''
                 def __new__(cls, u, l):
-                    s = str.__new__(cls, l)
+                    s = bytes.__new__(cls, l)
                     s._utf8 = u
                     return s
                 def __hash__(self):
                     return hash(self._utf8) # avoid collisions in local string space
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Raises error.Abort when a codec lookup fails (LookupError), with a
                 hint to check the locale settings.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original alongside the local bytes
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the re-encoded UTF-8 form, not the legacy bytes
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 Decoding uses the error mode configured through HGENCODINGMODE,
                 'strict' unless overridden. In strict mode an undecodable byte
                 aborts with an error message; 'replace' substitutes a special
                 Unicode character for it and 'ignore' drops it.

                 Raises error.Abort when decoding fails or the codec is unknown.
                 """
                 if isinstance(s, localstr):
                     # the UTF-8 original is cached, so the round-trip is lossless
                     return s._utf8
                 try:
                     decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to ten bytes either side of the offending position
                     lo = max(0, err.start - 10)
                     hi = err.start + 10
                     context = s[lo:hi]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def unitolocal(u):
                 """Encode unicode string u to UTF-8 and convert that to local encoding"""
                 utf8bytes = u.encode('utf-8')
                 return tolocal(utf8bytes)
             def unifromlocal(s):
                 """Convert local-encoding byte string s to UTF-8 and decode to unicode"""
                 utf8bytes = fromlocal(s)
                 return utf8bytes.decode('utf-8')
             def unimethod(bytesfunc):
                 """Wrap bytesfunc so its local-encoding byte result comes back as unicode

                 Intended for building __unicode__() / Python 3 __str__() methods that
                 forward to __bytes__()."""
                 def unifunc(obj):
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)
                 return unifunc
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             # On Python 3 they are the unicode converters defined above; otherwise
             # they are pycompat.identity.
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             # The result is the set of east_asian_width() codes that ucolwidth()
             # counts as two columns: "WFA" when HGENCODINGAMBIGUOUS is "wide",
             # otherwise "WF".
             _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                             and "WFA" or "WF")
             def colwidth(s):
                 """Return the display column width of local-encoding string s

                 Bytes that cannot be decoded are replaced before measuring."""
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 """Return the display column width of Unicode string d

                 Characters whose east-asian-width class is listed in _wide count as
                 two columns and all others as one. If unicodedata does not provide
                 east_asian_width, the length of d is returned."""
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 return sum([2 if eaw(ch) in _wide else 1 for ch in d])
             def getcols(s, start, c):
                 '''Return the substring of s beginning at byte index start that is
                 exactly c columns wide according to colwidth

                 Candidate end offsets run from start + c up to, but not including,
                 len(s). None is returned when no candidate has the requested width.'''
                 for end in xrange(start + c, len(s)):
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 When 's' cannot be decoded in the local encoding, it is trimmed by
                 byte count instead of by display columns.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # undecodable input: fall back to counting bytes as columns
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side; concat() puts
                 # the ellipsis back on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until what is left fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis # not enough room for multi-column characters
             def lower(s):
                 """best-effort encoding-aware lower-casing of local string s

                 Raises error.Abort when the configured encoding cannot be looked up."""
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     pass # not handled by the ASCII helper; decode and fold below
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     folded = text.lower()
                     if folded != text:
                         return folded.encode(_sysstr(encoding))
                     return s # nothing changed: preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 """best-effort encoding-aware upper-casing of local string s

                 The ASCII helper is tried first; upperfallback() takes over when it
                 raises UnicodeDecodeError."""
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 """upper-case local string s by decoding it, for use when the ASCII
                 helper cannot handle it

                 Raises error.Abort when the configured encoding cannot be looked up."""
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     raised = text.upper()
                     if raised != text:
                         return raised.encode(_sysstr(encoding))
                     return s # nothing changed: preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings

                 The value is chosen per platform and must match what normcase on
                 that platform really does.

                 This should be kept in sync with normcase_spec in util.h.'''
                 # normcase lowercases ASCII strings
                 lower = -1
                 # the fallback function should always be called
                 other = 0
                 # normcase uppercases ASCII strings
                 upper = 1
             # Lookup tables for jsonescape(), indexed by byte value; each entry is the
             # text to emit for that byte.
             _jsonmap = []
             # control characters default to \uXXXX escapes ...
             _jsonmap.extend("\\u%04x" % x for x in range(32))
             # ... printable ASCII maps to itself ...
             _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
             # ... and DEL (0x7f) is escaped as well
             _jsonmap.append('\\u007f')
             # override the entries that have a short JSON escape
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             # The paranoid table is copied while the list still has only 128 entries,
             # so indexing it with a byte >= 0x80 raises IndexError.
             _paranoidjsonmap = _jsonmap[:]
             _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # only the regular table passes bytes 0x80-0xff through unchanged
             _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 table = _paranoidjsonmap if paranoid else _jsonmap
                 u8chars = toutf8b(s)
                 try:
                     # fast path: one table lookup per byte; a byte the table has no
                     # entry for raises IndexError and sends us to the slow path
                     return ''.join(table[byte] for byte in bytearray(u8chars))
                 except IndexError:
                     pass
                 # slow path: walk UTF-16 code units so that a non-BMP char is
                 # represented as a surrogate pair
                 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
                 del u16codes[0]  # drop BOM
                 escaped = []
                 for code in u16codes:
                     if code < 128:
                         escaped.append(table[code])
                     else:
                         escaped.append('\\u%04x' % code)
                 return ''.join(escaped)
             # Indexed by the high nibble of a byte: how many bytes to try decoding as
             # one UTF-8 character starting there (0 means the byte is plain ASCII).
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''return the complete utf-8 character that starts at index pos of s

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 # the first nibble tells how many bytes to attempt decoding
                 nbytes = _utf8len[ord(s[pos]) >> 4]
                 if nbytes:
                     candidate = s[pos:pos + nbytes]
                     # decoding is only done for validation; the result is discarded
                     candidate.decode("utf-8")
                     return candidate
                 return s[pos] # ascii
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # The cached UTF-8 original is what clients should get, whatever
                     # the local bytes look like. This check used to sit behind the
                     # "\xed" test below, so a localstr whose local form contained
                     # that byte was escaped byte-by-byte instead.
                     return s._utf8
                 if "\xed" not in s:
                     # without "\xed" there can be no encoded U+DCxx to re-escape, so
                     # valid UTF-8 passes through untouched
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 chunks = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters: escape
                             # just the lead byte and rescan from the next one
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not the start of a valid character: map the single byte
                         # into U+DC00-U+DCFF
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     chunks.append(c)
                 return "".join(chunks)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 Characters in the U+DC00-U+DCFF escape range are turned back into
                 the single bytes they stand for; everything else is copied through.
                 This is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - every escaped U+DCxx sequence starts with this byte
                 if "\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 pieces = []
                 pos, end = 0, len(s)
                 while pos < end:
                     ch = getutf8char(s, pos)
                     pos += len(ch)
                     if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                         # unescape: the low eight bits are the original byte
                         ch = chr(ord(ch.decode("utf-8")) & 0xff)
                     pieces.append(ch)
                 return "".join(pieces)