##// END OF EJS Templates
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes...
Matt Harbison -
r52566:f70f61a8 default
parent child Browse files
Show More
@@ -1,718 +1,719
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import typing
12 import typing
13 import unicodedata
13 import unicodedata
14
14
15 from typing import (
15 from typing import (
16 Any,
16 Any,
17 Callable,
17 Callable,
18 Text,
18 Text,
19 TypeVar,
19 TypeVar,
20 )
20 )
21
21
22 from . import (
22 from . import (
23 error,
23 error,
24 policy,
24 policy,
25 pycompat,
25 pycompat,
26 )
26 )
27
27
28 from .pure import charencode as charencodepure
28 from .pure import charencode as charencodepure
29
29
# TypeVar bound to localstr so helpers returning the concrete subclass
# can be annotated precisely.
_Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# Load the charencode helpers per the cext/pure module policy (C
# implementation when available, pure Python otherwise).
charencode = policy.importmod('charencode')

# Fast-path helpers implemented in charencode (C or pure Python).
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# Shorthand for converting bytes to the native str type.
_sysstr = pycompat.sysstr

unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
53
53
54
54
def hfsignoreclean(s: bytes) -> bytes:
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint encodes to a UTF-8 sequence starting with
    # 0xe2 or 0xef (asserted at module load), so anything without those
    # lead bytes can be returned untouched.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
67
67
68
68
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
    if pycompat.sysplatform == b'OpenVMS':
        # workaround for a bug in VSI 3.10 port
        # os.environb is only populated with a few Predefined symbols
        def newget(self, key, default=None):
            # pytype on linux does not understand OpenVMS special modules
            import _decc  # pytype: disable=import-error

            v = _decc.getenv(key, None)
            if isinstance(key, bytes):
                # bytes key: encode the (str) value to match
                return default if v is None else v.encode('latin-1')
            else:
                return default if v is None else v

        # monkey-patch the lookup onto the environb mapping class
        environ.__class__.get = newget
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }
95
95
# Map locale-reported encoding names to the canonical codec name Python
# understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

encoding: bytes = b''  # help pytype avoid seeing None value
try:
    # HGENCODING overrides the locale; otherwise fall back to the locale's
    # preferred encoding, defaulting to ascii when that is empty.
    encoding = environ.get(b"HGENCODING", b'')
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# codec error handler used when decoding local strings: 'strict',
# 'replace' or 'ignore' (see fromlocal()).
encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
115
116
116
117
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # ``l`` is the local-encoding payload (the bytes value itself);
        # ``u`` caches the original UTF-8 form so fromlocal() can recover
        # it losslessly.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if typing.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u: bytes, l: bytes) -> None:
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
134
135
135
136
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

    # Pure marker type: no behavior is added beyond bytes, so instances
    # compare and hash exactly like plain byte strings (see doctests).
145
146
146
147
def tolocal(s: bytes) -> bytes:
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII is valid in any supported encoding: no conversion needed.
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # bytes so fromlocal() can round-trip it.
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
214
214
215
def fromlocal(s: bytes) -> bytes:
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # Lossless round-trip: a localstr carries its original UTF-8 form.
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is a subset of UTF-8, so such strings pass through unchanged.
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
    except UnicodeDecodeError as inst:
        # show a small window of context around the offending bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
    return decoded.encode("utf-8")
244
245
245
246
def unitolocal(u: str) -> bytes:
    """Convert a unicode string to a byte string of local encoding"""
    # Internal representation is UTF-8; tolocal() handles the rest.
    utf8_bytes = u.encode('utf-8')
    return tolocal(utf8_bytes)
249
250
250
251
def unifromlocal(s: bytes) -> str:
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which always decode cleanly.
    utf8_bytes = fromlocal(s)
    return utf8_bytes.decode('utf-8')
254
255
255
256
def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # render via the bytes function, then decode to the native str type
        return unifromlocal(bytesfunc(obj))

    return unifunc
264
265
265
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# These are plain aliases: str<->bytes conversion is the same operation
# as unicode<->local-bytes conversion on Python 3.
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
272
273
273
274
def lower(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    # fast C path for pure-ASCII input
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        uni = (
            s._utf8.decode("utf-8")
            if isinstance(s, localstr)
            else s.decode(_sysstr(encoding), _sysstr(encodingmode))
        )

        folded = uni.lower()
        if folded == uni:
            # nothing changed: return the original object so a localstr
            # tag (if any) survives
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
296
297
297
298
def upper(s: bytes) -> bytes:
    """best-effort encoding-aware case-folding of local string s"""
    # fast C path for pure-ASCII input; anything else takes the slow
    # decode/upper/encode route
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
304
305
305
306
def upperfallback(s: Any) -> Any:
    """Slow-path upper-casing for strings containing non-ASCII bytes."""
    try:
        if isinstance(s, localstr):
            uni = s._utf8.decode("utf-8")
        else:
            uni = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        upped = uni.upper()
        if upped == uni:
            # unchanged: hand back the original object to preserve any
            # localstr tagging
            return s
        return upped.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
323
324
324
325
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # match os.environ semantics by normalizing the key the
                # same way Windows does before lookup
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert every key/value through tolocal() so the exported mapping
    # is in the local encoding
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
340
341
341
342
# matches a lowercase drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        # return the cwd as local-encoding bytes with the drive letter
        # forced to upper case
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd


else:
    getcwd = os.getcwdb  # re-exports
372
373
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The result is the set of east_asian_width() categories counted as two
# columns: W(ide) and F(ullwidth), plus A(mbiguous) when requested.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
379
380
380
381
def colwidth(s: bytes) -> int:
    """Find the column width of a string for display in the local encoding"""
    # decode leniently ('replace') so invalid byte sequences still get a
    # width, then measure the unicode result
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
385
385
386
def ucolwidth(d: Text) -> int:
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available: assume one column per char
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)
392
393
393
394
def getcols(s: bytes, start: int, c: int) -> bytes:
    """Use colwidth to find a c-column substring of s starting at byte
    index start

    Raises ValueError when no prefix of ``s[start:]`` renders at exactly
    ``c`` display columns.
    """
    # Try successively longer byte substrings until one renders at the
    # requested column width.  The smallest candidate is c bytes long
    # (one column needs at least one byte).  The end bound is len(s) + 1
    # so that a substring running to the very end of ``s`` is also
    # considered; the original ``range(start + c, len(s))`` excluded it
    # and spuriously raised for e.g. getcols(b'ab', 0, 2).
    for end in range(start + c, len(s) + 1):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
402
403
403
404
def trim(
    s: bytes,
    width: int,
    ellipsis: bytes = b'',
    leftside: bool = False,
) -> bytes:
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming by byte count rather
        # than display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # accumulate characters (from the untrimmed side) until the column
    # budget is exceeded
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
508
509
509
510
class normcasespecs:
    """Describe how a platform's normcase treats ASCII strings.

    One value is specified per platform, and it should be consistent with
    what normcase on that platform actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
525
526
526
527
def jsonescape(s: bytes, paranoid: bool = False) -> bytes:
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # Normalize to UTF-8b first so arbitrary bytes survive the escaping.
    u8chars = toutf8b(s)
    try:
        # Fast path: accelerated implementation; it signals with ValueError
        # when it cannot handle the input, in which case we fall back.
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # Slow path: pure-Python fallback that handles everything.
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
578
579
579
580
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass'

# Length in bytes of a UTF-8 sequence, indexed by the high nibble of its
# first byte: 0x0-0x7 => ASCII (reported as 0), 0x8-0xB => lone continuation
# byte (length 1; rejected by the decode check below), 0xC-0xD => 2 bytes,
# 0xE => 3 bytes, 0xF => 4 bytes.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s: bytes, pos: int) -> bytes:
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    # Determine how many bytes to take from the first byte's high nibble.
    seqlen = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not seqlen:
        # Plain ASCII byte; always valid on its own.
        return s[pos : pos + 1]

    char = s[pos : pos + seqlen]
    # Validate by attempting a decode; raises UnicodeDecodeError on
    # malformed or truncated sequences.
    char.decode("utf-8", _utf8strict)
    return char
603
604
604
605
def toutf8b(s: bytes) -> bytes:
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        # pure ASCII is valid UTF-8 as-is
        return s
    # 0xed is the UTF-8 lead byte for U+D000..U+DFFF, so without it the
    # string cannot already contain U+DCxx escapes; if it also decodes
    # cleanly as UTF-8 it can be passed through untouched.
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # Slow path: walk the string character by character, escaping anything
    # that is not valid UTF-8 into the U+DCxx surrogate range.
    s = pycompat.bytestr(s)
    r = bytearray()
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return bytes(r)
671
672
672
673
def fromutf8b(s: bytes) -> bytes:
    """Given a UTF-8b string, return a local, possibly-binary string.

    Reverses toutf8b: bytes that were escaped into the U+DCxx surrogate
    range are mapped back to their original raw values, restoring the
    original binary string. This is a round-trip for strings such as
    filenames; metadata that went through tolocal stays UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    # Fast paths: pure ASCII needs no work, and 0xed is the only UTF-8
    # lead byte that can introduce a U+DCxx escape, so its absence means
    # the string is already in its original form.
    if isasciistr(s) or b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = bytearray()
    i = 0
    end = len(s)
    while i < end:
        char = getutf8char(s, i)
        i += len(char)
        if b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf":
            # unescape a U+DCxx character back to its original raw byte
            char = pycompat.bytechr(ord(char.decode("utf-8", _utf8strict)) & 0xFF)
        out += char
    return bytes(out)
General Comments 0
You need to be logged in to leave comments. Login now