##// END OF EJS Templates
encoding: define local identify functions with explicit type comments...
Augie Fackler -
r43770:5f2a8dab default
parent child Browse files
Show More
@@ -1,653 +1,660
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45
45
46
46
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignorable codepoints encode to UTF-8 starting with 0xe2 or 0xef,
    # so a quick containment check avoids the replace loop for most input.
    if b"\xe2" in s or b"\xef" in s:
        for ignored in _ignore:
            s = s.replace(ignored, b'')
    return s
59
59
60
60
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
95
95
96
96
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        # The bytes value of the instance is the local-encoding form;
        # the original UTF-8 form is cached on the instance.
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
108
108
109
109
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
119
119
120
120
def tolocal(s):
    """Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.
    """
    # Pure-ASCII input is identical in every encoding we care about.
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
185
185
186
186
def fromlocal(s):
    """Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
214
214
215
215
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))
219
219
220
220
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')
224
224
225
225
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # call the byte-returning method, then decode to a native unicode str
        return unifromlocal(bytesfunc(obj))

    return unifunc
234
234
235
235
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str already is bytes; spell the identity
    # conversions out as defs so they can carry explicit type comments

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
247
254
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
255
262
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports
267
274
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
274
281
275
282
def colwidth(s):
    b"Find the column width of a string for display in the local encoding"
    # decode with 'replace' so undecodable bytes still count as one column
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
279
286
280
287
def ucolwidth(d):
    b"Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # wide/fullwidth (and optionally ambiguous) characters take 2 columns
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)
287
294
288
295
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # grow the byte slice until it renders at exactly c display columns
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
296
303
297
304
def trim(s, width, ellipsis=b'', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    's' is measured in display columns (via ucolwidth) when it decodes
    cleanly in the local encoding; otherwise it falls back to plain byte
    slicing. When even the ellipsis does not fit, a prefix of the
    ellipsis of exactly 'width' bytes is returned.
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # not decodable in the local encoding: trim byte-wise
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    # drop characters one at a time from the trimmed side until it fits
    for dropped in pycompat.xrange(1, len(u)):
        usub = uslice(dropped)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
393
400
394
401
def lower(s):
    b"best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
415
422
416
423
def upper(s):
    b"best-effort encoding-aware case-folding of local string s"
    # fast path: pure-ASCII uppercasing in C; fall back for non-ASCII
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
423
430
424
431
def upperfallback(s):
    # encoding-aware uppercasing for strings asciiupper() could not handle
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
440
447
441
448
class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
457
464
458
465
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    If paranoid, non-ascii and common troublesome characters are also
    escaped. This is suitable for web output.
    '''
    u8chars = toutf8b(s)
    try:
        # C-accelerated escaper; raises ValueError on input it cannot handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
510
517
511
518
# Invalid UTF-8 bytes get mapped into the U+DC00-U+DCFF surrogate range,
# so on Python 3 the decode/encode round-trips below must let surrogate
# code points pass through untouched.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Total byte length of a UTF-8 sequence, indexed by the high nibble of
# its lead byte.  0 marks a plain ASCII byte (nibbles 0-7); nibbles 8-B
# are continuation bytes and map to 1 so the validating one-byte decode
# in getutf8char fails fast.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
520
527
521
528
def getutf8char(s, pos):
    '''return the complete utf-8 character starting at ``pos`` in ``s``

    Raises a UnicodeError if the bytes at the given location do not
    begin a well-formed utf-8 sequence.
    '''

    # the high nibble of the lead byte tells us how many bytes to take
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if width == 0:
        # plain ASCII byte; nothing further to validate
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # a throwaway decode validates the sequence (raises on bad input)
    char.decode("utf-8", _utf8strict)
    return char
538
545
539
546
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    # \xed is the lead byte of every 3-byte U+DCxx (and other surrogate)
    # encoding; if it is absent and the string decodes cleanly, s is
    # already valid UTF-8 and can pass through unchanged
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one utf-8 character at a time, escaping
    # invalid bytes into the U+DCxx surrogate range
    # NOTE(review): relies on pycompat.bytestr indexing yielding values
    # that ord() accepts on both py2 and py3 — confirm against pycompat
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not valid utf-8 at this position: smuggle the raw byte
            # through as U+DC00 + byte value
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
606
613
607
614
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b(): bytes that were smuggled into the
    U+DC00-U+DCFF surrogate range come back out as the original raw
    bytes. That makes this a round-trip for strings like filenames,
    while metadata that was passed through tolocal stays UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path: without an \xed lead byte there can be no U+DCxx
    # escapes to undo
    if b"\xed" not in s:
        return s

    # Walking the bytes with getutf8char avoids going through the
    # unicode type: some Python builds use UTF-16 internally
    # (issue5031), which would escape non-BMP code points.
    s = pycompat.bytestr(s)
    parts = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        pos += len(c)
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # U+DCxx escape: recover the original byte from the low bits
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        parts.append(c)
    return b"".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now