##// END OF EJS Templates
encoding: remove Python 2 support code...
Gregory Szorc -
r49747:fa2b1a46 default
parent child Browse files
Show More
@@ -1,745 +1,725 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# character-encoding helpers; policy.importmod picks the C implementation
# when available, falling back to the pure-Python one
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# shorthand for converting bytes to the native str type
_sysstr = pycompat.sysstr
48
48
49 if pycompat.ispy3:
49 unichr = chr
50 unichr = chr
51
50
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
51 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 # "Unicode Subtleties"), so we need to ignore them in some places for
52 # "Unicode Subtleties"), so we need to ignore them in some places for
54 # sanity.
53 # sanity.
55 _ignore = [
54 _ignore = [
56 unichr(int(x, 16)).encode("utf-8")
55 unichr(int(x, 16)).encode("utf-8")
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
56 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 b"206a 206b 206c 206d 206e 206f feff".split()
57 b"206a 206b 206c 206d 206e 206f feff".split()
59 ]
58 ]
60 # verify the next function will work
59 # verify the next function will work
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
60 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
61
63
62
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every ignorable sequence starts with 0xe2 or 0xef, so skip the
    # replacement loop entirely when neither lead byte is present
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
76
78
77
79 # encoding.environ is provided read-only, which may not be used to modify
78 # encoding.environ is provided read-only, which may not be used to modify
80 # the process environment
79 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
80 _nativeenviron = os.supports_bytes_environ
82 if not pycompat.ispy3:
81 if _nativeenviron:
83 environ = os.environ # re-exports
84 elif _nativeenviron:
85 environ = os.environb # re-exports
82 environ = os.environb # re-exports
86 else:
83 else:
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
84 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 # and recreate it once encoding is settled
85 # and recreate it once encoding is settled
89 environ = {
86 environ = {
90 k.encode('utf-8'): v.encode('utf-8')
87 k.encode('utf-8'): v.encode('utf-8')
91 for k, v in os.environ.items() # re-exports
88 for k, v in os.environ.items() # re-exports
92 }
89 }
93
90
# Map locale names reported by the platform to names Python's codec
# machinery understands.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'
103
100
try:
    # HGENCODING overrides the locale-derived encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    # locale setup is broken; fall back to the safest choice
    encoding = b'ascii'
# how decode errors are handled: 'strict' (abort), 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding assumed for pre-locale-support repository data
fallbackencoding = b'ISO-8859-1'
113
110
114
111
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # the bytes value is the local-encoding form; the original UTF-8
        # form is stashed on the instance for lossless round-tripping
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u
133
130
134
131
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, it compares and hashes exactly like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
141
145
142
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid in every supported encoding; nothing to convert
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: keep the UTF-8 original alongside for round-tripping
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured locale names an encoding Python does not know
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
210
214
211
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """
    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the undecodable bytes
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
245
242
246
243
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    as_utf8 = u.encode('utf-8')
    return tolocal(as_utf8)
251
248
252
249
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    as_utf8 = fromlocal(s)
    return as_utf8.decode('utf-8')
257
254
258
255
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(instance):
        # delegate to the bytes producer, then decode to native str
        return unifromlocal(bytesfunc(instance))

    return unifunc
268
265
269
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# native str is unicode here, so these are plain aliases of the uni* helpers
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
289
274
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded == text:
            # nothing changed: hand back the original (possibly a localstr)
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
313
298
314
299
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
322
307
323
308
def upperfallback(s):
    # type: (Any) -> Any
    # encoding-aware uppercase for strings that are not pure ASCII
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded == text:
            return s  # preserve localstring
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as err:
        raise error.Abort(
            pycompat.bytestr(err), hint=b"please check your locale settings"
        )
342
327
343
328
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # look up the uppercased key so lookups are case-insensitive
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    # convert the OS's str keys/values into local-encoding bytes
    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
359
344
360
345
# matches a lower-case Windows drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')
362
347
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            # force an upper-case drive letter for stable comparisons
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd

else:
    getcwd = os.getcwdb  # re-exports
393
376
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
400
383
401
384
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
406
389
407
390
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available; assume one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
415
398
416
399
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
426
409
427
410
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # accumulate characters (from the kept side) until the column budget
    # would be exceeded
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
528
511
529
512
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    other = 0
    lower = -1
    upper = 1
545
528
546
529
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # Normalize to UTF-8b first so arbitrary bytes survive the escaping.
    encoded = toutf8b(s)
    try:
        # Fast path: the (possibly C-accelerated) escaper; it signals
        # inputs it cannot handle by raising ValueError.
        return _jsonescapeu8fast(encoded, paranoid)
    except ValueError:
        pass
    # Slow path: pure-Python fallback for strings the fast path rejected.
    return charencodepure.jsonescapeu8fallback(encoded, paranoid)
599
582
600
583
601 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
602 # bytes are mapped to that range.
585 # bytes are mapped to that range.
603 if pycompat.ispy3:
586 _utf8strict = r'surrogatepass'
604 _utf8strict = r'surrogatepass'
605 else:
606 _utf8strict = r'strict'
607
587
608 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
609
589
610
590
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # The high nibble of the first byte tells us how many bytes to
    # attempt decoding.
    first = s[pos : pos + 1]
    seqlen = _utf8len[ord(first) >> 4]
    if seqlen == 0:
        # ascii: single byte, no validation needed
        return first

    seq = s[pos : pos + seqlen]
    # validate with attempted decode (raises on a malformed sequence)
    seq.decode("utf-8", _utf8strict)
    return seq
628
608
629
609
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate-range lead byte present: a plain UTF-8 validity
        # check is sufficient for the fast path
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                # (fix: use chr() — unichr is the Python 2 builtin and is
                # undefined on Python 3 with py2 support removed)
                c = chr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = chr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
697
677
698
678
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    parts = []
    end = len(s)
    idx = 0
    while idx < end:
        ch = getutf8char(s, idx)
        idx += len(ch)
        # unescape U+DCxx characters back to the raw byte they encode
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        parts.append(ch)
    return b"".join(parts)
General Comments 0
You need to be logged in to leave comments. Login now