upstream/mercurial-mirror Commit - r33929:6c119dbf

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import io

10

import io

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

from .pure import (

21

from .pure import (

22

charencode as charencodepure,

22

charencode as charencodepure,

23

)

23

)

24

25

# Import the charencode module through policy.importmod() and re-export the
# helpers this module relies on under short module-level names.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand for pycompat.sysstr, used for codec and error-handler names
_sysstr = pycompat.sysstr

# Python 3 has no unichr(); chr() takes its place there
if pycompat.ispy3:
    unichr = chr

36

37

# HFS+ ignores the following unicode characters (Apple Technote 1150,
# "Unicode Subtleties"), so for sanity we have to ignore them in some
# places as well. Each entry is stored as its UTF-8 byte sequence.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# hfsignoreclean() only scans strings containing one of these lead bytes,
# so make sure every ignorable sequence starts with one of them
assert all(seq.startswith(("\xe2", "\xef")) for seq in _ignore)

45

46

def hfsignoreclean(s):
    """Return s with every codepoint that HFS+ ignores stripped out.

    s is a UTF-8 encoded byte string; the sequences removed are the ones
    listed in the module-level _ignore table.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with \xe2 or \xef, so a string with
    # neither byte cannot contain one and is returned untouched
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s

58

59

# encoding.environ is exported read-only: it must not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if pycompat.ispy3:
    if _nativeenviron:
        environ = os.environb # re-exports
    else:
        # the preferred encoding isn't known yet, so encode with utf-8 to
        # avoid unicode errors; the dict is recreated once the encoding is
        # settled
        environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                       for k, v in os.environ.items()) # re-exports
else:
    environ = os.environ # re-exports

71

72

# Alternate spellings of ASCII, each mapped to a callable returning the
# codec name to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

# HGENCODING takes precedence. Without it the locale's preferred encoding
# is used (passed through _encodingfixers), and plain ASCII is the answer
# when the locale machinery raises locale.Error.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error handler name used when decoding local strings (see fromlocal)
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() when data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

86

87

class localstr(bytes):
    '''A local-encoding byte string that remembers its UTF-8 original.

    The bytes value is the local representation (l); the UTF-8 form (u) is
    kept in the _utf8 attribute so that unmodified strings can be
    round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

96

97

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repos that predate
    locale support may contain latin1 or possibly other character sets.
    Decoding is therefore attempted strictly as UTF-8 first, then with
    the fallback encoding (Latin-1), and as a last resort as UTF-8 with
    unknown characters replaced.

    When the local form cannot represent the string exactly, a localstr
    is returned: it carries the known UTF-8 encoding next to the local
    representation so that fromlocal() can restore it losslessly.

    Raises error.Abort if a codec name cannot be looked up.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure ASCII reads the same in every encoding handled here
    if isasciistr(s):
        return s

    try:
        try:
            # check that the string really is stored as UTF-8
            ustr = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding is the internal encoding
                return s
            local = ustr.encode(_sysstr(encoding), u"replace")
            if ustr == local.decode(_sysstr(encoding)):
                # nothing was replaced, local is a lossless encoding of s
                return local
            return localstr(s, local)
        except UnicodeDecodeError:
            # only expected when looking at an ancient changeset
            try:
                ustr = s.decode(_sysstr(fallbackencoding))
                local = ustr.encode(_sysstr(encoding), u"replace")
                if ustr == local.decode(_sysstr(encoding)):
                    # nothing was replaced, local is a lossless encoding
                    return local
                return localstr(ustr.encode('UTF-8'), local)
            except UnicodeDecodeError:
                # last ditch: this result cannot be round-tripped
                ustr = s.decode("utf-8", "replace")
                return ustr.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

161

162

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode given by HGENCODINGMODE, 'strict' by
    default, in which unknown characters abort with an error message.
    The other modes are 'replace', which substitutes a special Unicode
    character for each unknown one, and 'ignore', which drops them.

    Raises error.Abort on a decoding failure or an unknown codec name.
    """

    # a localstr already knows its UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    # ASCII needs no conversion
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # report up to ten bytes on either side of the offending position
        first = max(0, inst.start - 10)
        sub = s[first:inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

187

188

def unitolocal(u):
    """Return the unicode string u as a byte string in the local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

191

192

def unifromlocal(s):
    """Return the local-encoding byte string s as a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

195

196

def unimethod(bytesfunc):
    """Build a proxy method so that __unicode__() and, on Python 3,
    __str__() can be forwarded to __bytes__()

    The returned function calls bytesfunc on the object and converts the
    resulting local byte string to unicode."""
    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)
    return unifunc

202

203

# Converters between the native str type and byte strings. Use them where
# the text is not encoding-aware (e.g. an exception message) or is known
# to depend on the locale (e.g. date formatting).
if not pycompat.ispy3:
    # native str is already a byte string: nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity
else:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod

214

215

if not _nativeenviron:
    # the encoding and the helper functions exist by now, so rebuild the
    # environ dict that is exported to other modules, this time in the
    # local encoding
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items()) # re-exports

220

221

# East Asian width classes counted as two columns by ucolwidth(). Setting
# HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class ("A") to the
# wide ("W") and fullwidth ("F") ones.
_wide = _sysstr("WFA"
                if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                else "WF")

224

225

def colwidth(s):
    "Return the number of display columns of local-encoding string s"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)

228

229

def ucolwidth(d):
    "Return the number of display columns of the Unicode string d"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    # characters whose East Asian width class is in _wide take two columns
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)

235

236

def getcols(s, start, c):
    '''Return the substring of s that begins at byte index start and is
    exactly c columns wide according to colwidth

    Candidate end positions run from start + c up to, but not including,
    len(s); None is returned implicitly when none of them matches.'''
    for end in xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate

243

244

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    A string that cannot be decoded in the local encoding is trimmed by
    bytes instead of by columns.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for the text itself once the ellipsis is accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to counting bytes
        if len(s) <= width:
            return s # already fits
        if room <= 0:
            # not enough room even for the ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        return s # already fits

    if room <= 0:
        # not enough room even for the ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on each round until
    # what remains fits into the available columns
    for dropped in xrange(1, len(u)):
        if leftside:
            kept = u[dropped:]
        else:
            kept = u[:-dropped]
        if ucolwidth(kept) <= room:
            text = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + text
            return text + ellipsis
    # not enough room for multi-column characters
    return ellipsis

336

337

def lower(s):
    "best-effort encoding-aware lowercasing of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # asciilower could not handle s; decode it properly below
        pass
    try:
        if isinstance(s, localstr):
            # prefer the cached UTF-8 form
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = ustr.lower()
        if folded != ustr:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

357

358

def upper(s):
    "best-effort encoding-aware uppercasing of local string s"
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        # asciiupper could not handle s; take the decoding route
        folded = upperfallback(s)
    return folded

364

365

def upperfallback(s):
    "uppercase local string s by decoding it; used by upper() as fallback"
    try:
        if isinstance(s, localstr):
            # prefer the cached UTF-8 form
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = ustr.upper()
        if folded != ustr:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve a localstr
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

380

381

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    The value is chosen per platform and has to agree with what normcase
    really does on that platform.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0

395

396

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON has no way to carry non-Unicode bytes, which is a problem for
    us. The input is therefore processed as follows:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = 'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    utf8b = toutf8b(s)
    try:
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        # the fast escaper rejected the input; use the pure fallback below
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)

447

448

# Number of bytes getutf8char() attempts to decode, indexed by the high
# nibble of the first byte. 0 marks an ASCII byte, which is returned as-is.
_utf8len = [
    0, 0, 0, 0, 0, 0, 0, 0, # 0x0_ - 0x7_
    1, 1, 1, 1,             # 0x8_ - 0xb_
    2, 2,                   # 0xc_ - 0xd_
    3,                      # 0xe_
    4,                      # 0xf_
]

449

450

def getutf8char(s, pos):
    '''return the complete utf-8 character of string s that starts at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos]
    # the first nibble tells how many bytes to try decoding
    size = _utf8len[ord(lead) >> 4]
    if not size:
        # ascii: the single byte is the character
        return lead

    char = s[pos:pos + size]
    # decode only to validate; the result is discarded
    char.decode("utf-8")
    return char

466

467

def toutf8b(s):

467

def toutf8b(s):

468

'''convert a local, possibly-binary string into UTF-8b

468

'''convert a local, possibly-binary string into UTF-8b

469

470

This is intended as a generic method to preserve data when working

470

This is intended as a generic method to preserve data when working

471

with schemes like JSON and XML that have no provision for

471

with schemes like JSON and XML that have no provision for

472

arbitrary byte strings. As Mercurial often doesn't know

472

arbitrary byte strings. As Mercurial often doesn't know

473

what encoding data is in, we use so-called UTF-8b.

473

what encoding data is in, we use so-called UTF-8b.

474

475

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

475

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

476

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

476

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

477

uDC00-uDCFF.

477

uDC00-uDCFF.

478

479

Principles of operation:

479

Principles of operation:

480

481

- ASCII and UTF-8 data successfully round-trips and is understood

481

- ASCII and UTF-8 data successfully round-trips and is understood

482

by Unicode-oriented clients

482

by Unicode-oriented clients

483

- filenames and file contents in arbitrary other encodings can have

483

- filenames and file contents in arbitrary other encodings can have

484

be round-tripped or recovered by clueful clients

484

be round-tripped or recovered by clueful clients

485

- local strings that have a cached known UTF-8 encoding (aka

485

- local strings that have a cached known UTF-8 encoding (aka

486

localstr) get sent as UTF-8 so Unicode-oriented clients get the

486

localstr) get sent as UTF-8 so Unicode-oriented clients get the

487

Unicode data they want

487

Unicode data they want

488

- because we must preserve UTF-8 bytestring in places such as

488

- because we must preserve UTF-8 bytestring in places such as

489

filenames, metadata can't be roundtripped without help

489

filenames, metadata can't be roundtripped without help

490

491

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

491

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

492

arbitrary bytes into an internal Unicode format that can be

492

arbitrary bytes into an internal Unicode format that can be

493

re-encoded back into the original. Here we are exposing the

493

re-encoded back into the original. Here we are exposing the

494

internal surrogate encoding as a UTF-8 string.)

494

internal surrogate encoding as a UTF-8 string.)

495

'''

495

'''

496

497

if not isinstance(s, localstr) and isasciistr(s):

498

return s

497

if "\xed" not in s:

499

if "\xed" not in s:

498

if isinstance(s, localstr):

500

if isinstance(s, localstr):

499

return s._utf8

501

return s._utf8

500

try:

502

try:

501

s.decode('utf-8')

503

s.decode('utf-8')

502

return s

504

return s

503

except UnicodeDecodeError:

505

except UnicodeDecodeError:

504

pass

506

pass

505

507

506

r = ""

508

r = ""

507

pos = 0

509

pos = 0

508

l = len(s)

510

l = len(s)

509

while pos < l:

511

while pos < l:

510

try:

512

try:

511

c = getutf8char(s, pos)

513

c = getutf8char(s, pos)

512

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

514

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

513

# have to re-escape existing U+DCxx characters

515

# have to re-escape existing U+DCxx characters

514

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

516

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

515

pos += 1

517

pos += 1

516

else:

518

else:

517

pos += len(c)

519

pos += len(c)

518

except UnicodeDecodeError:

520

except UnicodeDecodeError:

519

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

521

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

520

pos += 1

522

pos += 1

521

r += c

523

r += c

522

return r

524

return r

523

525

524

def fromutf8b(s):

526

def fromutf8b(s):

525

'''Given a UTF-8b string, return a local, possibly-binary string.

527

'''Given a UTF-8b string, return a local, possibly-binary string.

526

528

527

return the original binary string. This

529

return the original binary string. This

528

is a round-trip process for strings like filenames, but metadata

530

is a round-trip process for strings like filenames, but metadata

529

that's was passed through tolocal will remain in UTF-8.

531

that's was passed through tolocal will remain in UTF-8.

530

532

531

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

533

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

532

>>> m = "\\xc3\\xa9\\x99abcd"

534

>>> m = "\\xc3\\xa9\\x99abcd"

533

>>> toutf8b(m)

535

>>> toutf8b(m)

534

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

536

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

535

>>> roundtrip(m)

537

>>> roundtrip(m)

536

True

538

True

537

>>> roundtrip("\\xc2\\xc2\\x80")

539

>>> roundtrip("\\xc2\\xc2\\x80")

538

True

540

True

539

>>> roundtrip("\\xef\\xbf\\xbd")

541

>>> roundtrip("\\xef\\xbf\\xbd")

540

True

542

True

541

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

543

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

542

True

544

True

543

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

545

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

544

True

546

True

545

'''

547

'''

546

548

549

if isasciistr(s):

550

return s

547

# fast path - look for uDxxx prefixes in s

551

# fast path - look for uDxxx prefixes in s

548

if "\xed" not in s:

552

if "\xed" not in s:

549

return s

553

return s

550

554

551

# We could do this with the unicode type but some Python builds

555

# We could do this with the unicode type but some Python builds

552

# use UTF-16 internally (issue5031) which causes non-BMP code

556

# use UTF-16 internally (issue5031) which causes non-BMP code

553

# points to be escaped. Instead, we use our handy getutf8char

557

# points to be escaped. Instead, we use our handy getutf8char

554

# helper again to walk the string without "decoding" it.

558

# helper again to walk the string without "decoding" it.

555

559

556

r = ""

560

r = ""

557

pos = 0

561

pos = 0

558

l = len(s)

562

l = len(s)

559

while pos < l:

563

while pos < l:

560

c = getutf8char(s, pos)

564

c = getutf8char(s, pos)

561

pos += len(c)

565

pos += len(c)

562

# unescape U+DCxx characters

566

# unescape U+DCxx characters

563

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

567

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

564

c = chr(ord(c.decode("utf-8")) & 0xff)

568

c = chr(ord(c.decode("utf-8")) & 0xff)

565

r += c

569

r += c

566

return r

570

return r

567

571

568

if pycompat.ispy3:

572

if pycompat.ispy3:

569

class strio(io.TextIOWrapper):

573

class strio(io.TextIOWrapper):

570

"""Wrapper around TextIOWrapper that respects hg's encoding assumptions.

574

"""Wrapper around TextIOWrapper that respects hg's encoding assumptions.

571

575

572

Also works around Python closing streams.

576

Also works around Python closing streams.

573

"""

577

"""

574

578

575

def __init__(self, buffer):

579

def __init__(self, buffer):

576

super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

580

super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

577

581

578

def __del__(self):

582

def __del__(self):

579

"""Override __del__ so it doesn't close the underlying stream."""

583

"""Override __del__ so it doesn't close the underlying stream."""

580

else:

584

else:

581

strio = pycompat.identity

585

strio = pycompat.identity

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import io
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             from .pure import (
                 charencode as charencodepure,
             )
             # Load the 'charencode' module through policy.importmod().
             # NOTE(review): presumably this picks the C or the pure-Python
             # implementation depending on the module policy -- confirm in policy.py.
             charencode = policy.importmod(r'charencode')
             # Re-export the charencode helpers under module-level names.
             isasciistr = charencode.isasciistr
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             _jsonescapeu8fast = charencode.jsonescapeu8fast
             _sysstr = pycompat.sysstr
             # On Python 3 the name unichr is bound to chr so the code below can call
             # unichr() on either major version.
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry is the UTF-8 encoding of one code point given in hex below.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only looks for the
             # lead bytes \xe2 and \xef before scanning, so every entry must start with
             # one of them
             assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 """Strip the codepoints that HFS+ ignores out of s.

                 The entries of _ignore are only searched for when s contains one of
                 the lead bytes they start with; otherwise s is returned untouched.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # cheap pre-check on the two possible lead bytes
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for ignored in _ignore:
                     s = s.replace(ignored, '')
                 return s
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # _nativeenviron is true when a byte-string environment mapping can be used
             # directly: always on Python 2, and on Python 3 only when
             # os.supports_bytes_environ is set.
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             # Maps encoding names reported by the locale to a replacement name; both
             # entries here resolve to 'ascii'.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
             }
             # Pick the local encoding: HGENCODING from the environment wins, otherwise
             # the locale's preferred encoding (passed through _encodingfixers), and
             # 'ascii' when the locale module raises locale.Error.
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # Error-handling mode handed to decode() by fromlocal(), lower() and
             # upperfallback(); defaults to "strict".
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # Encoding tolocal() tries when its input is not valid UTF-8.
             fallbackencoding = 'ISO-8859-1'
             class localstr(bytes):
                 '''Byte string in the local encoding that remembers its UTF-8 form.

                 The bytes value is the local representation; the UTF-8 original is
                 kept in the ``_utf8`` attribute so that unmodified strings can be
                 round-tripped to the local encoding and back.'''

                 def __new__(cls, u, l):
                     # l becomes the bytes payload, u is cached alongside it
                     obj = bytes.__new__(cls, l)
                     obj._utf8 = u
                     return obj

                 def __hash__(self):
                     # hash the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 ASCII input is returned unchanged. A plain byte string is returned
                 when the conversion loses nothing; a localstr is returned when
                 characters had to be replaced. A LookupError raised while
                 converting is re-raised as error.Abort with a hint about the
                 locale settings.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 if isasciistr(s):
                     return s
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original next to the local bytes
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the re-encoded UTF-8 form, not the legacy bytes
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     # raised by the codec lookups above for an unknown encoding name
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr hands back its cached UTF-8 form and ASCII passes
                 through unchanged. Anything else is decoded using the encoding
                 mode set by HGENCODINGMODE, which defaults to 'strict'. In this
                 mode, unknown characters will cause an error message. Other modes
                 include 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.

                 Decoding failures and unknown encodings are reported as
                 error.Abort.
                 """
                 # lossless round-trip available?
                 if isinstance(s, localstr):
                     return s._utf8
                 if isasciistr(s):
                     return s
                 try:
                     return s.decode(_sysstr(encoding),
                                     _sysstr(encodingmode)).encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to ten bytes on either side of the offending position
                     begin = max(0, err.start - 10)
                     context = s[begin:err.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def unitolocal(u):
                 """Turn the unicode string u into a byte string of local encoding"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 """Turn the local-encoding byte string s into a unicode string"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             def unimethod(bytesfunc):
                 """Build a proxy method that forwards __unicode__() and __str__() of
                 Python 3 to __bytes__()

                 The returned function calls bytesfunc on its argument and converts
                 the local-encoding result to unicode."""
                 def unifunc(obj):
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)
                 return unifunc
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             # On Python 3 they are the unicode<->local converters defined in this
             # module; otherwise all three are bound to pycompat.identity.
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             # _wide holds the east_asian_width() category letters that ucolwidth()
             # counts as two columns: "WFA" when HGENCODINGAMBIGUOUS is "wide",
             # otherwise "WF".
             _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                             and "WFA" or "WF")
             def colwidth(s):
                 "Return the display column width of the local-encoding string s"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display column width of the Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 # characters whose width category is listed in _wide take two columns
                 return sum(2 if eaw(ch) in _wide else 1 for ch in d)
             def getcols(s, start, c):
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start

                 Candidate end offsets are tried in increasing order; None is returned
                 when no candidate is exactly c columns wide.'''
                 for end in xrange(start + c, len(s)):
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 When 's' cannot be decoded with the local encoding it is trimmed by
                 bytes instead of display columns.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # undecodable input: fall back to trimming by byte count
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # reserve room for the ellipsis; width is now the budget for text
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side; concat puts the
                 # ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the remainder fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis # not enough room for multi-column characters
             def lower(s):
                 "Lowercase the local string s, honouring its encoding on a best-effort basis"
                 # fast path: asciilower raises UnicodeDecodeError for input it
                 # cannot handle
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     pass
                 try:
                     if not isinstance(s, localstr):
                         decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     else:
                         decoded = s._utf8.decode("utf-8")
                     folded = decoded.lower()
                     if folded != decoded:
                         return folded.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 "Uppercase the local string s, honouring its encoding on a best-effort basis"
                 try:
                     folded = asciiupper(s)
                 except UnicodeDecodeError:
                     # asciiupper could not handle s; take the decoding route
                     folded = upperfallback(s)
                 return folded
             def upperfallback(s):
                 "Uppercase the local string s by decoding it first; upper() falls back to this"
                 try:
                     if not isinstance(s, localstr):
                         decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     else:
                         decoded = s._utf8.decode("utf-8")
                     folded = decoded.upper()
                     if folded != decoded:
                         return folded.encode(_sysstr(encoding))
                     return s # preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''Describes what a platform's normcase does to ASCII strings

                 The value is chosen per platform and should be consistent with what
                 normcase on that platform actually does.

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''
                 other = 0
                 upper = 1
                 lower = -1
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON cannot carry non-Unicode bytes, so the input is prepared as
                 follows before the JSON-specified string escaping is applied:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\')
                 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> s = 'escape characters: \\0 \\x0b \\x7f'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 utf8b = toutf8b(s)
                 try:
                     escaped = _jsonescapeu8fast(utf8b, paranoid)
                 except ValueError:
                     # the fast escaper rejected this input
                     pass
                 else:
                     return escaped
                 return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
             # Length in bytes of the sequence to try decoding, indexed by the high
             # nibble of its first byte; 0 means a single ASCII byte.
             _utf8len = [
                 0, 0, 0, 0, 0, 0, 0, 0,  # 0x0-0x7
                 1, 1, 1, 1,              # 0x8-0xb
                 2, 2,                    # 0xc-0xd
                 3,                       # 0xe
                 4,                       # 0xf
             ]

             def getutf8char(s, pos):
                 '''Return the complete utf-8 character of s that begins at pos

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos]
                 # the first nibble tells how many bytes to attempt decoding
                 nbytes = _utf8len[ord(lead) >> 4]
                 if nbytes == 0: # ascii
                     return lead

                 candidate = s[pos:pos + nbytes]
                 # decoding is done purely for validation; the result is discarded
                 candidate.decode("utf-8")
                 return candidate
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''

                 # plain ASCII needs no work; a localstr is excluded because its
                 # cached UTF-8 form, not its local bytes, must be returned
                 if not isinstance(s, localstr) and isasciistr(s):
                     return s
                 # without a \xed byte there can be no pre-existing U+DCxx sequence,
                 # so valid UTF-8 can be returned untouched
                 if "\xed" not in s:
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass

                 # slow path: walk s one character at a time, escaping as needed
                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not a valid character start: escape this single byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     chunks.append(c)
                 # join once instead of growing a string inside the loop
                 return "".join(chunks)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 Returns the original binary string. This is a round-trip process
                 for strings like filenames, but metadata that was passed through
                 tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''

                 if isasciistr(s):
                     return s
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s

                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.

                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters: the low byte of the code point is
                     # the original byte
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         c = chr(ord(c.decode("utf-8")) & 0xff)
                     chunks.append(c)
                 # join once instead of growing a string inside the loop
                 return "".join(chunks)
             if not pycompat.ispy3:
                 # Outside Python 3 no wrapper class is defined; strio is simply
                 # bound to pycompat.identity.
                 strio = pycompat.identity
             else:
                 class strio(io.TextIOWrapper):
                     """TextIOWrapper variant that follows hg's encoding assumptions.

                     The text layer is always created with the module-level
                     ``encoding`` (converted through ``_sysstr``), and the
                     finalizer is overridden so that it does not close the
                     wrapped stream.
                     """

                     def __init__(self, buffer):
                         # Resolve the encoding name first, then defer to the
                         # stock TextIOWrapper constructor.
                         codec = _sysstr(encoding)
                         super(strio, self).__init__(buffer, encoding=codec)

                     def __del__(self):
                         """Override __del__ so it doesn't close the underlying stream."""

             from __future__ import absolute_import
             import unittest
             from mercurial import (
                 encoding,
             )
             class IsasciistrTest(unittest.TestCase):
                 """Checks encoding.isasciistr() against ASCII and non-ASCII bytes."""

                 # Pure-ASCII samples of several lengths, one with an embedded NUL.
                 asciistrs = [
                     b'a',
                     b'ab',
                     b'abc',
                     b'abcd',
                     b'abcde',
                     b'abcdefghi',
                     b'abcd\0fghi',
                 ]

                 def testascii(self):
                     """Every sample must be recognised as ASCII."""
                     for sample in self.asciistrs:
                         self.assertTrue(encoding.isasciistr(sample))

                 def testnonasciichar(self):
                     """Setting the high bit of any one byte must fail the check."""
                     for sample in self.asciistrs:
                         pristine = bytearray(sample)
                         for index, value in enumerate(pristine):
                             # Work on a fresh copy so only one byte is altered
                             # per probe.
                             tainted = bytearray(pristine)
                             tainted[index] = value | 0x80
                             self.assertFalse(encoding.isasciistr(bytes(tainted)))
             class LocalEncodingTest(unittest.TestCase):
                 """Checks the ASCII fast path of tolocal() and fromlocal()."""

                 def testasciifastpath(self):
                     """ASCII input must come back as the identical object."""
                     nuls = b'\0' * 100
                     localized = encoding.tolocal(nuls)
                     self.assertTrue(localized is nuls)
                     delocalized = encoding.fromlocal(nuls)
                     self.assertTrue(delocalized is nuls)
+            class Utf8bEncodingTest(unittest.TestCase):
                 # Covers the ASCII fast path of the UTF-8b conversion helpers.
+                def testasciifastpath(self):
                     # 100 NUL bytes are pure ASCII. Both helpers must return the
                     # very same object (identity, not merely equality), i.e. the
                     # input is passed through without building a new string.
+                    s = b'\0' * 100
+                    self.assertTrue(s is encoding.toutf8b(s))
+                    self.assertTrue(s is encoding.fromutf8b(s))
             if __name__ == '__main__':
                 # Imported here so the runner is only needed when this file is
                 # executed directly as a script.
                 import silenttestrunner as runner
                 runner.main(__name__)