##// END OF EJS Templates
windows: enforce upper case drive letter for getcwd in mercurial too...
marmoute -
r48421:d6ee6456 default
parent child Browse files
Show More
@@ -1,719 +1,736 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import re
12 import unicodedata
13 import unicodedata
13
14
14 from .pycompat import getattr
15 from .pycompat import getattr
15 from . import (
16 from . import (
16 error,
17 error,
17 policy,
18 policy,
18 pycompat,
19 pycompat,
19 )
20 )
20
21
21 from .pure import charencode as charencodepure
22 from .pure import charencode as charencodepure
22
23
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # type variable bound to localstr, for annotating factory-style helpers
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# C-accelerated or pure-Python character-encoding helpers, chosen by the
# policy module at import time.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
63
63
64
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignorable codepoint's UTF-8 form starts with 0xe2 or 0xef
    # (asserted next to _ignore), so a cheap containment probe lets the
    # common case skip the replace loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
78
78
79
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map platform-reported locale codec names to their canonical Python names
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING overrides the locale-derived default encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
113
114
114
115
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # type: (bytes, bytes) -> localstr
        # The bytes value of the instance is the local-encoding form `l`;
        # the known-good UTF-8 form `u` is stashed on the instance so
        # fromlocal() can recover it losslessly.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
133
134
134
135
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
145
145
146
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name itself is unknown to Python
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
214
214
215
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a small window of context around the undecodable bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
243
244
244
245
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249
250
250
251
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255
256
256
257
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
266
267
267
268
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        # on Python 2 the native str is already a byte string
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286
287
287
288
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
309
310
310
311
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes: take the slower locale-aware path
        return upperfallback(s)
318
319
319
320
def upperfallback(s):
    # type: (Any) -> Any
    """locale-aware uppercasing for strings containing non-ASCII bytes"""
    try:
        if isinstance(s, localstr):
            # use the cached lossless UTF-8 form
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
336
337
337
338
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # match os.environ's case-insensitive lookup semantics
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
353
354
354
355
# matches a cwd starting with a lower-case drive letter (e.g. b'c:\\...');
# already-upper-case letters deliberately don't match and need no rewrite
DRIVE_RE = re.compile(b'^[a-z]:')

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        #
        # However this is not reliable, so lets explicitly make this drive
        # letter upper case.
        #
        # note: we should consider dropping realpath here since it seems to
        # change the semantic of `getcwd`.

        def getcwd():
            # type: () -> bytes
            """Return the cwd as local bytes with an upper-cased drive letter."""
            cwd = os.getcwd()  # re-exports
            cwd = os.path.realpath(cwd)
            cwd = strtolocal(cwd)
            if DRIVE_RE.match(cwd):
                # normalize e.g. b'c:\\repo' to b'C:\\repo' so repo.root
                # comparisons are consistent across Python versions
                cwd = cwd[0:1].upper() + cwd[1:]
            return cwd

    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports
371
388
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() categories counted as 2 columns:
# W(ide), F(ullwidth), and optionally A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
378
395
379
396
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
384
401
385
402
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no East-Asian width data available; assume one cell per character
        return len(d)
    return sum([2 if eaw(c) in _wide else 1 for c in d])
393
410
394
411
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
404
421
405
422
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to byte-based trimming
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the
    # remainder fits in the available display columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
502
519
503
520
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    # normcase lowercases ASCII (e.g. Windows, macOS)
    lower = -1
    # normcase uppercases ASCII
    upper = 1
    # no simple ASCII rule; always use the fallback function
    other = 0
519
536
520
537
521 def jsonescape(s, paranoid=False):
538 def jsonescape(s, paranoid=False):
522 # type: (Any, Any) -> Any
539 # type: (Any, Any) -> Any
523 """returns a string suitable for JSON
540 """returns a string suitable for JSON
524
541
525 JSON is problematic for us because it doesn't support non-Unicode
542 JSON is problematic for us because it doesn't support non-Unicode
526 bytes. To deal with this, we take the following approach:
543 bytes. To deal with this, we take the following approach:
527
544
528 - localstr/safelocalstr objects are converted back to UTF-8
545 - localstr/safelocalstr objects are converted back to UTF-8
529 - valid UTF-8/ASCII strings are passed as-is
546 - valid UTF-8/ASCII strings are passed as-is
530 - other strings are converted to UTF-8b surrogate encoding
547 - other strings are converted to UTF-8b surrogate encoding
531 - apply JSON-specified string escaping
548 - apply JSON-specified string escaping
532
549
533 (escapes are doubled in these tests)
550 (escapes are doubled in these tests)
534
551
535 >>> jsonescape(b'this is a test')
552 >>> jsonescape(b'this is a test')
536 'this is a test'
553 'this is a test'
537 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
554 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
538 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
555 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
539 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
556 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
540 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
557 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
541 >>> jsonescape(b'a weird byte: \\xdd')
558 >>> jsonescape(b'a weird byte: \\xdd')
542 'a weird byte: \\xed\\xb3\\x9d'
559 'a weird byte: \\xed\\xb3\\x9d'
543 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
560 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
544 'utf-8: caf\\xc3\\xa9'
561 'utf-8: caf\\xc3\\xa9'
545 >>> jsonescape(b'')
562 >>> jsonescape(b'')
546 ''
563 ''
547
564
548 If paranoid, non-ascii and common troublesome characters are also escaped.
565 If paranoid, non-ascii and common troublesome characters are also escaped.
549 This is suitable for web output.
566 This is suitable for web output.
550
567
551 >>> s = b'escape characters: \\0 \\x0b \\x7f'
568 >>> s = b'escape characters: \\0 \\x0b \\x7f'
552 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
569 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
553 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
570 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
554 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
571 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
555 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
572 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
556 'escape boundary: ~ \\\\u007f \\\\u0080'
573 'escape boundary: ~ \\\\u007f \\\\u0080'
557 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
574 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
558 'a weird byte: \\\\udcdd'
575 'a weird byte: \\\\udcdd'
559 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
576 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
560 'utf-8: caf\\\\u00e9'
577 'utf-8: caf\\\\u00e9'
561 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
578 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
562 'non-BMP: \\\\ud834\\\\udd1e'
579 'non-BMP: \\\\ud834\\\\udd1e'
563 >>> jsonescape(b'<foo@example.org>', paranoid=True)
580 >>> jsonescape(b'<foo@example.org>', paranoid=True)
564 '\\\\u003cfoo@example.org\\\\u003e'
581 '\\\\u003cfoo@example.org\\\\u003e'
565 """
582 """
566
583
567 u8chars = toutf8b(s)
584 u8chars = toutf8b(s)
568 try:
585 try:
569 return _jsonescapeu8fast(u8chars, paranoid)
586 return _jsonescapeu8fast(u8chars, paranoid)
570 except ValueError:
587 except ValueError:
571 pass
588 pass
572 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
589 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
573
590
574
591
575 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
592 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
576 # bytes are mapped to that range.
593 # bytes are mapped to that range.
577 if pycompat.ispy3:
594 if pycompat.ispy3:
578 _utf8strict = r'surrogatepass'
595 _utf8strict = r'surrogatepass'
579 else:
596 else:
580 _utf8strict = r'strict'
597 _utf8strict = r'strict'
581
598
582 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
599 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
583
600
584
601
585 def getutf8char(s, pos):
602 def getutf8char(s, pos):
586 # type: (bytes, int) -> bytes
603 # type: (bytes, int) -> bytes
587 """get the next full utf-8 character in the given string, starting at pos
604 """get the next full utf-8 character in the given string, starting at pos
588
605
589 Raises a UnicodeError if the given location does not start a valid
606 Raises a UnicodeError if the given location does not start a valid
590 utf-8 character.
607 utf-8 character.
591 """
608 """
592
609
593 # find how many bytes to attempt decoding from first nibble
610 # find how many bytes to attempt decoding from first nibble
594 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
611 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
595 if not l: # ascii
612 if not l: # ascii
596 return s[pos : pos + 1]
613 return s[pos : pos + 1]
597
614
598 c = s[pos : pos + l]
615 c = s[pos : pos + l]
599 # validate with attempted decode
616 # validate with attempted decode
600 c.decode("utf-8", _utf8strict)
617 c.decode("utf-8", _utf8strict)
601 return c
618 return c
602
619
603
620
604 def toutf8b(s):
621 def toutf8b(s):
605 # type: (bytes) -> bytes
622 # type: (bytes) -> bytes
606 """convert a local, possibly-binary string into UTF-8b
623 """convert a local, possibly-binary string into UTF-8b
607
624
608 This is intended as a generic method to preserve data when working
625 This is intended as a generic method to preserve data when working
609 with schemes like JSON and XML that have no provision for
626 with schemes like JSON and XML that have no provision for
610 arbitrary byte strings. As Mercurial often doesn't know
627 arbitrary byte strings. As Mercurial often doesn't know
611 what encoding data is in, we use so-called UTF-8b.
628 what encoding data is in, we use so-called UTF-8b.
612
629
613 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
630 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
614 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
631 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
615 uDC00-uDCFF.
632 uDC00-uDCFF.
616
633
617 Principles of operation:
634 Principles of operation:
618
635
619 - ASCII and UTF-8 data successfully round-trips and is understood
636 - ASCII and UTF-8 data successfully round-trips and is understood
620 by Unicode-oriented clients
637 by Unicode-oriented clients
621 - filenames and file contents in arbitrary other encodings can have
638 - filenames and file contents in arbitrary other encodings can have
622 be round-tripped or recovered by clueful clients
639 be round-tripped or recovered by clueful clients
623 - local strings that have a cached known UTF-8 encoding (aka
640 - local strings that have a cached known UTF-8 encoding (aka
624 localstr) get sent as UTF-8 so Unicode-oriented clients get the
641 localstr) get sent as UTF-8 so Unicode-oriented clients get the
625 Unicode data they want
642 Unicode data they want
626 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
643 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
627 - because we must preserve UTF-8 bytestring in places such as
644 - because we must preserve UTF-8 bytestring in places such as
628 filenames, metadata can't be roundtripped without help
645 filenames, metadata can't be roundtripped without help
629
646
630 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
647 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
631 arbitrary bytes into an internal Unicode format that can be
648 arbitrary bytes into an internal Unicode format that can be
632 re-encoded back into the original. Here we are exposing the
649 re-encoded back into the original. Here we are exposing the
633 internal surrogate encoding as a UTF-8 string.)
650 internal surrogate encoding as a UTF-8 string.)
634 """
651 """
635
652
636 if isinstance(s, localstr):
653 if isinstance(s, localstr):
637 # assume that the original UTF-8 sequence would never contain
654 # assume that the original UTF-8 sequence would never contain
638 # invalid characters in U+DCxx range
655 # invalid characters in U+DCxx range
639 return s._utf8
656 return s._utf8
640 elif isinstance(s, safelocalstr):
657 elif isinstance(s, safelocalstr):
641 # already verified that s is non-lossy in legacy encoding, which
658 # already verified that s is non-lossy in legacy encoding, which
642 # shouldn't contain characters in U+DCxx range
659 # shouldn't contain characters in U+DCxx range
643 return fromlocal(s)
660 return fromlocal(s)
644 elif isasciistr(s):
661 elif isasciistr(s):
645 return s
662 return s
646 if b"\xed" not in s:
663 if b"\xed" not in s:
647 try:
664 try:
648 s.decode('utf-8', _utf8strict)
665 s.decode('utf-8', _utf8strict)
649 return s
666 return s
650 except UnicodeDecodeError:
667 except UnicodeDecodeError:
651 pass
668 pass
652
669
653 s = pycompat.bytestr(s)
670 s = pycompat.bytestr(s)
654 r = b""
671 r = b""
655 pos = 0
672 pos = 0
656 l = len(s)
673 l = len(s)
657 while pos < l:
674 while pos < l:
658 try:
675 try:
659 c = getutf8char(s, pos)
676 c = getutf8char(s, pos)
660 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
677 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
661 # have to re-escape existing U+DCxx characters
678 # have to re-escape existing U+DCxx characters
662 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
679 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
663 pos += 1
680 pos += 1
664 else:
681 else:
665 pos += len(c)
682 pos += len(c)
666 except UnicodeDecodeError:
683 except UnicodeDecodeError:
667 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
684 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
668 pos += 1
685 pos += 1
669 r += c
686 r += c
670 return r
687 return r
671
688
672
689
673 def fromutf8b(s):
690 def fromutf8b(s):
674 # type: (bytes) -> bytes
691 # type: (bytes) -> bytes
675 """Given a UTF-8b string, return a local, possibly-binary string.
692 """Given a UTF-8b string, return a local, possibly-binary string.
676
693
677 return the original binary string. This
694 return the original binary string. This
678 is a round-trip process for strings like filenames, but metadata
695 is a round-trip process for strings like filenames, but metadata
679 that's was passed through tolocal will remain in UTF-8.
696 that's was passed through tolocal will remain in UTF-8.
680
697
681 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
698 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
682 >>> m = b"\\xc3\\xa9\\x99abcd"
699 >>> m = b"\\xc3\\xa9\\x99abcd"
683 >>> toutf8b(m)
700 >>> toutf8b(m)
684 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
701 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
685 >>> roundtrip(m)
702 >>> roundtrip(m)
686 True
703 True
687 >>> roundtrip(b"\\xc2\\xc2\\x80")
704 >>> roundtrip(b"\\xc2\\xc2\\x80")
688 True
705 True
689 >>> roundtrip(b"\\xef\\xbf\\xbd")
706 >>> roundtrip(b"\\xef\\xbf\\xbd")
690 True
707 True
691 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
708 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
692 True
709 True
693 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
710 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
694 True
711 True
695 """
712 """
696
713
697 if isasciistr(s):
714 if isasciistr(s):
698 return s
715 return s
699 # fast path - look for uDxxx prefixes in s
716 # fast path - look for uDxxx prefixes in s
700 if b"\xed" not in s:
717 if b"\xed" not in s:
701 return s
718 return s
702
719
703 # We could do this with the unicode type but some Python builds
720 # We could do this with the unicode type but some Python builds
704 # use UTF-16 internally (issue5031) which causes non-BMP code
721 # use UTF-16 internally (issue5031) which causes non-BMP code
705 # points to be escaped. Instead, we use our handy getutf8char
722 # points to be escaped. Instead, we use our handy getutf8char
706 # helper again to walk the string without "decoding" it.
723 # helper again to walk the string without "decoding" it.
707
724
708 s = pycompat.bytestr(s)
725 s = pycompat.bytestr(s)
709 r = b""
726 r = b""
710 pos = 0
727 pos = 0
711 l = len(s)
728 l = len(s)
712 while pos < l:
729 while pos < l:
713 c = getutf8char(s, pos)
730 c = getutf8char(s, pos)
714 pos += len(c)
731 pos += len(c)
715 # unescape U+DCxx characters
732 # unescape U+DCxx characters
716 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
733 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
717 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
734 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
718 r += c
735 r += c
719 return r
736 return r
General Comments 0
You need to be logged in to leave comments. Login now