##// END OF EJS Templates
typing: add type hints to the `charencode` module...
Matt Harbison -
r52615:43adbe03 default
parent child Browse files
Show More
@@ -1,718 +1,728
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import typing
12 import typing
13 import unicodedata
13 import unicodedata
14
14
15 from typing import (
15 from typing import (
16 Any,
16 Any,
17 Callable,
17 Callable,
18 Text,
18 Text,
19 TypeVar,
19 TypeVar,
20 )
20 )
21
21
22 from . import (
22 from . import (
23 error,
23 error,
24 policy,
24 policy,
25 pycompat,
25 pycompat,
26 )
26 )
27
27
28 from .pure import charencode as charencodepure
28 from .pure import charencode as charencodepure
29
29
# TypeVar bound to localstr; presumably used to type methods returning the
# invoking subclass — the usage is not visible in this chunk (TODO confirm).
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# Load charencode through the policy loader, which selects the C extension
# when available and falls back to the pure Python implementation.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

# Python 3 dropped unichr(); keep the historical name as an alias of chr().
unichr = chr

if typing.TYPE_CHECKING:
    # TODO: make a stub file for .cext.charencode, and import here
    # The pure implementations carry type hints; re-import them for the type
    # checker only — at runtime the policy-selected bindings above remain.
    from .pure.charencode import (
        asciilower,
        asciiupper,
        isasciistr,
        jsonescapeu8fast as _jsonescapeu8fast,
    )
51
52
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    chr(int(codepoint, 16)).encode("utf-8")
    for codepoint in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)


def hfsignoreclean(s: bytes) -> bytes:
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to UTF-8 starting with 0xe2 or 0xef
    # (asserted above), so a cheap scan short-circuits the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
67
77
68
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            raw = _decc.getenv(key, None)
            if raw is None:
                return default
            # bytes keys get latin-1 encoded values; str keys pass through
            return raw.encode('latin-1') if isinstance(key, bytes) else raw

        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        key.encode('utf-8'): value.encode('utf-8')
        for key, value in os.environ.items()  # re-exports
    }
95
105
# Map codec names reported by the environment/locale to names Python's
# codec registry understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

encoding: bytes = b''  # help pytype avoid seeing None value
try:
    # HGENCODING overrides the locale; otherwise use the locale's preferred
    # encoding, falling back to ascii when that is empty.
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# codec error handler used when decoding local strings: 'strict' by default,
# overridable to e.g. 'replace' or 'ignore' via HGENCODINGMODE.
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
116
126
117
127
class localstr(bytes):
    """A local-encoding byte string that remembers its original UTF-8 form,
    so the conversion can be round-tripped without loss."""

    def __new__(cls, u, l):
        inst = bytes.__new__(cls, l)
        inst._utf8 = u
        return inst

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        # hash the cached UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)
135
145
136
146
class safelocalstr(bytes):
    """Marker subclass: the string was an internal UTF-8 string whose local
    encoding is lossless, so it can be converted back to UTF-8 directly.

    Behaves exactly like plain bytes for comparison and hashing:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
146
156
147
157
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """
    if isasciistr(s):
        return s

    # the codec name used for every encode/decode below
    enc = _sysstr(encoding)
    try:
        try:
            # make sure the string is actually valid UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path: the local encoding already is UTF-8
                return s
            r = u.encode(enc, "replace")
            if u == r.decode(enc):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(enc, "replace")
                if u == r.decode(enc):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(enc, "replace")
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
214
224
215
225
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        start = inst.start
        sub = s[max(0, start - 10) : start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
245
255
246
256
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
250
260
251
261
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
255
265
256
266
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
265
275
266
276
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
283
274
284
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass  # not pure ASCII; fall through to the encoding-aware path
    try:
        if isinstance(s, localstr):
            # decode the cached UTF-8 form losslessly
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = u.lower()
        if folded == u:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
297
307
298
308
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII; take the encoding-aware slow path
        return upperfallback(s)
305
315
306
316
def upperfallback(s: bytes) -> bytes:
    """encoding-aware uppercasing for strings that are not pure ASCII

    Mirrors lower(): decode using the local encoding (or the cached UTF-8
    form for localstr), uppercase, and re-encode; fall back to ASCII folding
    when the string cannot be decoded.

    Typed bytes -> bytes (instead of Any -> Any) for consistency with
    lower()/upper(): the only caller, upper(s: bytes) -> bytes, passes
    bytes in and returns this function's result unchanged.
    """
    try:
        if isinstance(s, localstr):
            # decode the cached UTF-8 form losslessly
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
324
334
325
335
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # normalize lookups the same way the OS normalized the keys
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert each name/value through tolocal() now that `encoding` is known
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
341
351
342
352
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = strtolocal(os.path.realpath(os.getcwd()))  # re-exports
        if DRIVE_RE.match(cwd):
            # force the drive letter to upper case
            cwd = cwd[:1].upper() + cwd[1:]
        return cwd

else:
    getcwd = os.getcwdb  # re-exports
372
382
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are east_asian_width() categories: W(ide), F(ullwidth), and
# optionally A(mbiguous).
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
379
389
380
390
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
394
385
395
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available; assume one column per character
        return len(d)
    # characters in the _wide categories occupy two columns, the rest one
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
392
402
393
403
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # widen the byte window until it renders as exactly c display columns
    for end in range(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
402
412
403
413
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    Width is measured in display columns (see ucolwidth()) when 's' decodes
    in the local encoding, and in bytes when it does not.

    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # invalid byte sequence: fall back to byte-wise trimming
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    glyphs = list(u)
    if leftside:
        glyphs.reverse()
    used = 0
    # the break is guaranteed to fire because the total width exceeds 'width'
    for idx, ch in enumerate(glyphs):
        used += ucolwidth(ch)
        if used > width:
            break
    glyphs = glyphs[:idx]
    if leftside:
        glyphs.reverse()
    u = u''.join(glyphs).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
508
518
509
519
class normcasespecs:
    """Describe what a platform's normcase does to ASCII strings.

    One of these constants is specified per platform and must be consistent
    with what normcase on that platform actually does:

      lower: normcase lowercases ASCII strings
      upper: normcase uppercases ASCII strings
      other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
525
535
526
536
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize to UTF-8b first so escaping only has to deal with UTF-8
    u8chars = toutf8b(s)
    try:
        # fast path (possibly the C extension); raises ValueError when a
        # non-ASCII character would need escaping
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # pure-Python slow path that can escape non-ASCII characters as well
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
578
588
579
589
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass'

# total byte length of a UTF-8 sequence, indexed by the high nibble of its
# lead byte: 0x0-0x7 -> ASCII (handled as length 1 by getutf8char),
# 0x8-0xB -> 1 (continuation byte, invalid as a lead; decoding will raise),
# 0xC-0xD -> 2-byte, 0xE -> 3-byte, 0xF -> 4-byte sequence
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
585
595
586
596
def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    # the lead byte's high nibble determines how many bytes the sequence
    # should span (see the _utf8len table)
    seqlen = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not seqlen:
        # plain ASCII: the byte is a complete character on its own
        return s[pos : pos + 1]

    char = s[pos : pos + seqlen]
    # decode purely as validation; the decoded value is discarded and a
    # UnicodeDecodeError propagates for malformed sequences
    char.decode("utf-8", _utf8strict)
    return char
603
613
604
614
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # fast path: without a 0xED byte there can be no U+DCxx sequences
        # to re-escape, so already-valid UTF-8 passes through unmodified
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one UTF-8 character at a time
    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return bytes(r)
671
681
672
682
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s; without a 0xED byte there
    # are no escaped characters to undo
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters back to the original single byte
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return bytes(r)
@@ -1,86 +1,86
1 # charencode.py - miscellaneous character encoding
1 # charencode.py - miscellaneous character encoding
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import array
9 import array
10
10
11 from .. import pycompat
11 from .. import pycompat
12
12
13
13
def isasciistr(s: bytes) -> bool:
    """Report whether *s* consists entirely of ASCII bytes."""
    # decoding is used purely as a validity probe; the result is discarded
    try:
        s.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
20
20
21
21
def asciilower(s: bytes) -> bytes:
    """convert a string to lowercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    # decode() both validates (raising UnicodeDecodeError on any non-ASCII
    # byte) and yields a str whose lower() agrees with bytes.lower() for
    # ASCII input; re-encoding restores the bytes type callers expect
    return s.decode('ascii').lower().encode('ascii')
28
28
29
29
def asciiupper(s: bytes) -> bytes:
    """convert a string to uppercase if ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found."""
    # decode() both validates (raising UnicodeDecodeError on any non-ASCII
    # byte) and yields a str whose upper() agrees with bytes.upper() for
    # ASCII input; re-encoding restores the bytes type callers expect
    return s.decode('ascii').upper().encode('ascii')
36
36
37
37
# Byte-indexed JSON escape tables: _jsonmap[b] is the JSON representation of
# byte value b. Control characters get \uXXXX (or their short escapes),
# printable ASCII passes through, and 0x80-0xFF pass through raw (they are
# assumed to be part of valid UTF-8 sequences by jsonescapeu8fast's caller).
_jsonmap = []
_jsonmap.extend(b"\\u%04x" % x for x in range(32))
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append(b'\\u007f')
# JSON's dedicated short escapes override the generic \uXXXX forms
_jsonmap[0x09] = b'\\t'
_jsonmap[0x0A] = b'\\n'
_jsonmap[0x22] = b'\\"'
_jsonmap[0x5C] = b'\\\\'
_jsonmap[0x08] = b'\\b'
_jsonmap[0x0C] = b'\\f'
_jsonmap[0x0D] = b'\\r'
# the paranoid variant additionally escapes HTML-sensitive characters and,
# by stopping at 0x7F, forces the fast path to reject non-ASCII input
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3C] = b'\\u003c'  # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3E] = b'\\u003e'  # '>'
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
53
53
54
54
def jsonescapeu8fast(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (fast path)

    Raises ValueError if non-ASCII characters have to be escaped.
    """
    table = _paranoidjsonmap if paranoid else _jsonmap
    # bytearray iteration yields ints, which index straight into the table;
    # the paranoid table only covers 0x00-0x7F, so any non-ASCII byte
    # falls off the end and surfaces as the documented ValueError
    try:
        return b''.join(table[byte] for byte in bytearray(u8chars))
    except IndexError:
        raise ValueError
68
68
69
69
# 'surrogatepass' lets lone U+DCxx surrogates (UTF-8b's encoding of invalid
# bytes) survive decode/encode instead of raising
_utf8strict = r'surrogatepass'
71
71
72
72
def jsonescapeu8fallback(u8chars: bytes, paranoid: bool) -> bytes:
    """Convert a UTF-8 byte string to JSON-escaped form (slow path)

    Escapes all non-ASCII characters no matter if paranoid is False.
    """
    table = _paranoidjsonmap if paranoid else _jsonmap
    # round-trip through UTF-16 so that non-BMP characters come out as
    # surrogate pairs, matching JSON's \uXXXX escape model
    u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
    u16codes = array.array('H', u16b)
    u16codes.pop(0)  # drop BOM prepended by encode('utf-16')
    return b''.join(
        table[code] if code < 128 else b'\\u%04x' % code for code in u16codes
    )
General Comments 0
You need to be logged in to leave comments. Login now