##// END OF EJS Templates
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara -
r34213:1c601df9 default
parent child Browse files
Show More
@@ -1,588 +1,590 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import io
10 import io
11 import locale
11 import locale
12 import os
12 import os
13 import unicodedata
13 import unicodedata
14
14
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import (
21 from .pure import (
22 charencode as charencodepure,
22 charencode as charencodepure,
23 )
23 )
24
24
# character-encoding primitives; policy.importmod selects the configured
# implementation (C extension or pure Python) of the 'charencode' module
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand: convert a byte string to the native str type of this Python
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is the equivalent there
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
45
45
def hfsignoreclean(s):
    """Strip the HFS+-ignored codepoints from byte string s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable UTF-8 sequence starts with one of these two lead
    # bytes, so a cheap containment test skips the replace loop entirely
    # for the common case
    if not ("\xe2" in s or "\xef" in s):
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s
58
58
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())  # re-exports

# normalize alternate spellings of locale names to their canonical form
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

try:
    # HGENCODING overrides the locale-derived character encoding
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handler name used when decoding: 'strict' (default), 'replace'
# or 'ignore' (see fromlocal)
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
86
86
class localstr(bytes):
    '''A byte string that carries its known, lossless UTF-8 spelling.

    An instance behaves as the locally-encoded bytes ``l`` while
    remembering the original UTF-8 bytes ``u``, so a lossy local
    conversion can still be round-tripped back to UTF-8 exactly.
    '''
    def __new__(cls, u, l):
        self = super(localstr, cls).__new__(cls, l)
        self._utf8 = u
        return self

    def __hash__(self):
        # hash the UTF-8 form so two different UTF-8 inputs that map to
        # the same lossy local bytes do not collide in dictionaries
        return hash(self._utf8)
96
96
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII needs no transcoding at all
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy conversion: keep the exact UTF-8 bytes alongside it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(k, hint="please check your locale settings")
161
161
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding honours the mode set by HGENCODINGMODE, 'strict' by
    default: unknown characters abort with an error message.  Other
    modes are 'replace' (substitute a special Unicode character) and
    'ignore' (drop the character).

    Raises error.Abort on a strict-mode decode failure or when the
    configured encoding itself is unknown.
    """
    # localstr caches the exact UTF-8 bytes: lossless round-trip for free
    if isinstance(s, localstr):
        return s._utf8
    # pure ASCII needs no transcoding at all
    if isasciistr(s):
        return s

    try:
        return s.decode(_sysstr(encoding), _sysstr(encodingmode)).encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending bytes
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
187
187
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
191
191
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
195
195
def unimethod(bytesfunc):
    """Wrap a __bytes__() implementation so that it can also serve as
    __unicode__() and __str__() on Python 3, returning unicode."""
    def unifunc(self):
        return unifromlocal(bytesfunc(self))
    return unifunc
202
202
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str already is a byte string, so no
    # conversion is necessary
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
214
214
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The string holds the east_asian_width() categories counted as two
# columns by ucolwidth(); 'A' (ambiguous) is included only on request.
_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                and "WFA" or "WF")
224
224
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode leniently: bytes that fail to decode become single-column
    # replacement characters
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
228
228
def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: one column per character
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum(2 if eaw(c) in _wide else 1 for c in d)
235
235
def getcols(s, start, c):
    '''Use colwidth to find a substring of s, beginning at byte index
    start, that occupies exactly c display columns.  Returns None when
    no such substring exists.'''
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
243
243
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming raw bytes, one column
        # per byte
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # no enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # no enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until what
    # remains (plus the ellipsis) fits in 'width' columns
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis # no enough room for multi-column characters
339
339
def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast, encoding-independent path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 spelling
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.lower()
        if folded == u:
            # nothing changed: hand back the original (may be a localstr)
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
360
360
def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        # fast, encoding-independent path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the encoding-aware slow path
        return upperfallback(s)
367
367
def upperfallback(s):
    """Encoding-aware uppercasing of local string s (slow path)."""
    try:
        if isinstance(s, localstr):
            # use the cached, known-good UTF-8 spelling
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.upper()
        if folded == u:
            # nothing changed: hand back the original (may be a localstr)
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
383
383
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII
    strings.

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    # three distinct sentinel values; only their identity matters
    lower = -1
    upper = 1
    other = 0
398
398
def jsonescape(s, paranoid=False):
    '''Return byte string s escaped for inclusion in a JSON string.

    JSON is problematic for us because it doesn't support non-Unicode
    bytes.  To deal with this:

    - localstr objects are converted back to their cached UTF-8 form
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to the UTF-8b surrogate encoding
    - JSON-specified string escaping is applied on top

    If paranoid is True, non-ASCII characters and common troublesome
    characters (such as '<' and '>') are escaped as well, which makes
    the output suitable for web contexts.
    '''
    u8chars = toutf8b(s)
    try:
        # accelerated escaping; raises ValueError on input it cannot
        # handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # the pure-Python fallback covers everything the fast path rejected
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
450
450
451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
452
452
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''
    # the lead byte's high nibble determines how many bytes to attempt
    # decoding; zero means a plain single-byte (ASCII) character
    width = _utf8len[ord(s[pos:pos + 1]) >> 4]
    if not width:
        return s[pos:pos + 1]

    char = s[pos:pos + width]
    # a trial decode validates the sequence (raises UnicodeError if bad)
    char.decode("utf-8")
    return char
469
469
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    # pure-ASCII input needs no transformation (unless it carries a
    # cached UTF-8 form as a localstr, handled below)
    if not isinstance(s, localstr) and isasciistr(s):
        return s
    # fast path: \xed is the lead byte of the U+DCxx surrogate escapes;
    # if absent, the string either is already valid UTF-8 or must be
    # escaped byte-by-byte in the slow loop
    if "\xed" not in s:
        if isinstance(s, localstr):
            # localstr caches its known-good UTF-8 representation
            return s._utf8
        try:
            s.decode('utf-8')
            return s
        except UnicodeDecodeError:
            pass

    # wrap so that byte-wise indexing below behaves the same on py3
    s = pycompat.bytestr(s)
    res = ""
    pos = 0
    end = len(s)
    while pos < end:
        try:
            ch = getutf8char(s, pos)
            if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                pos += 1
            else:
                pos += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            ch = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        res += ch
    return res
528
529
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        # pure ASCII cannot contain surrogate escapes
        return s
    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    # wrap so that byte-wise slicing/indexing behaves the same on py3
    s = pycompat.bytestr(s)
    res = ""
    pos = 0
    end = len(s)
    while pos < end:
        ch = getutf8char(s, pos)
        pos += len(ch)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= ch <= "\xed\xb3\xbf":
            # recover the original byte from the low 8 bits of the
            # surrogate code point
            ch = chr(ord(ch.decode("utf-8")) & 0xff)
        res += ch
    return res
574
576
if pycompat.ispy3:
    class strio(io.TextIOWrapper):
        """Wrapper around TextIOWrapper that respects hg's encoding assumptions.

        Also works around Python closing streams.
        """

        def __init__(self, buffer):
            # decode/encode using hg's detected encoding rather than the
            # interpreter's locale default
            super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

        def __del__(self):
            """Override __del__ so it doesn't close the underlying stream."""
else:
    # on Python 2, str already is the byte-oriented type; no wrapping needed
    strio = pycompat.identity
General Comments 0
You need to be logged in to leave comments. Login now