##// END OF EJS Templates
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings...
Arseniy Alekseyev -
r51214:95acba2c default
parent child Browse files
Show More
@@ -1,725 +1,725 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8
8
9 import locale
9 import locale
10 import os
10 import os
11 import re
11 import re
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 if pycompat.TYPE_CHECKING:
23 if pycompat.TYPE_CHECKING:
24 from typing import (
24 from typing import (
25 Any,
25 Any,
26 Callable,
26 Callable,
27 List,
27 List,
28 Text,
28 Text,
29 Type,
29 Type,
30 TypeVar,
30 TypeVar,
31 Union,
31 Union,
32 )
32 )
33
33
34 # keep pyflakes happy
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
36 assert t
37
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
# Load the C implementation of charencode when available, falling back to
# the pure-Python version via the policy importer.
charencode = policy.importmod('charencode')

# Re-export the fast ASCII helpers at module level.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

# Python 3 dropped unichr(); keep the familiar name as an alias for chr().
unichr = chr
50
50
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: every ignored codepoint's UTF-8
# encoding must begin with 0xe2 or 0xef, which hfsignoreclean() uses as
# a cheap pre-filter before running the replace loop
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
61
61
62
62
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip out the codepoints that HFS+ silently ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every entry in _ignore encodes to bytes starting with 0xe2 or 0xef
    # (asserted at module load), so a quick scan for those lead bytes lets
    # us skip the replace loop for the common case.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
76
76
77
77
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = os.supports_bytes_environ
if _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map misleading locale-reported names to their canonical codec names
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING overrides the locale-derived encoding when set
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
110
110
111
111
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # u: the original UTF-8 bytes; l: the (possibly lossy) local form.
        # The instance *is* the local form, with the UTF-8 form cached on it.
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
130
130
131
131
class safelocalstr(bytes):
    """A bytes subclass marking a string that started life as internal
    UTF-8 and converts back to UTF-8 without loss.

    Unlike localstr, no extra state is carried, so instances hash and
    compare exactly like ordinary bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
141
141
142
142
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # pure-ASCII bytes are valid in any encoding; return unchanged
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: keep the original UTF-8 cached alongside
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
210
210
211
211
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # localstr carries its original UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    # ASCII is a subset of UTF-8, so nothing to convert
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
    except UnicodeDecodeError as inst:
        # show a short window of bytes around the offending position
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
    return decoded.encode("utf-8")
242
242
243
243
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
248
248
249
249
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
254
254
255
255
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)

    return unifunc
265
265
266
266
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
# On Python 3 the native str type is unicode, so these are simply the
# unicode converters under str-flavored names.
strtolocal = unitolocal
strfromlocal = unifromlocal
strmethod = unimethod
273
273
274
274
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy re-decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown encoding name: abort with a locale hint
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
298
298
299
299
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present; take the slower unicode-aware path
        return upperfallback(s)
307
307
308
308
def upperfallback(s):
    # type: (Any) -> Any
    # slow path for non-ASCII input; structured as the mirror of lower()
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form to avoid a lossy re-decode
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown encoding name: abort with a locale hint
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
327
327
328
328
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # normalize lookups the same way, so mixed-case keys work
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
344
344
345
345
# matches a lowercase Windows drive-letter prefix like b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
# returns bytes.
if pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called.
    #
    # Additionally, py3.8+ uppercases the drive letter when calling
    # os.path.realpath(), which is used on ``repo.root``. Since those
    # strings are compared in various places as simple strings, also call
    # realpath here. See https://bugs.python.org/issue40368
    #
    # However this is not reliable, so lets explicitly make this drive
    # letter upper case.
    #
    # note: we should consider dropping realpath here since it seems to
    # change the semantic of `getcwd`.

    def getcwd():
        cwd = os.getcwd()  # re-exports
        cwd = os.path.realpath(cwd)
        cwd = strtolocal(cwd)
        if DRIVE_RE.match(cwd):
            # force the drive letter to upper case for consistent comparisons
            cwd = cwd[0:1].upper() + cwd[1:]
        return cwd


else:
    getcwd = os.getcwdb  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# "WF" / "WFA" are the unicodedata.east_asian_width() categories counted as
# two columns by ucolwidth(): Wide, Fullwidth, and optionally Ambiguous.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
383
383
384
384
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
389
389
390
390
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display

    Characters whose east-asian-width category is in _wide occupy two
    terminal columns; everything else counts as one.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        # generator expression avoids materializing an intermediate list,
        # and the conditional expression replaces the dated `and/or` idiom
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    # no east_asian_width available: assume one column per character
    return len(d)
398
398
399
399
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # grow the candidate slice one byte at a time until it renders as
    # exactly c display columns
    end = start + c
    limit = len(s)
    while end < limit:
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
        end += 1
    raise ValueError('substring not found')
409
409
410
410
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable bytes: fall back to trimming by byte count instead
        # of display columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # trim from the right by default; reversing first (and re-reversing
    # after) makes the same loop trim from the left
    chars = list(u)
    if leftside:
        chars.reverse()
    width_so_far = 0
    # accumulate display columns until the budget is exceeded; 'i' then
    # indexes the first character that no longer fits
    for i, c in enumerate(chars):
        width_so_far += ucolwidth(c)
        if width_so_far > width:
            break
    chars = chars[:i]
    if leftside:
        chars.reverse()
    u = u''.join(chars).encode(_sysstr(encoding))
    if leftside:
        return ellipsis + u
    return u + ellipsis
511
511
512
512
class normcasespecs:
    """Describes what a platform's normcase does to ASCII strings.

    Each platform declares one of these values, and it must agree with
    what that platform's normcase implementation actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
528
528
529
529
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON cannot represent arbitrary non-Unicode bytes, so we proceed as
    follows:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        # fast path: the C helper raises ValueError on input it can't handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # slow but fully general pure-Python fallback
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
582
582
583
583
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
585 # bytes are mapped to that range.
585 # bytes are mapped to that range.
586 _utf8strict = r'surrogatepass'
586 _utf8strict = r'surrogatepass'
587
587
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
589
589
590
590
591 def getutf8char(s, pos):
591 def getutf8char(s, pos):
592 # type: (bytes, int) -> bytes
592 # type: (bytes, int) -> bytes
593 """get the next full utf-8 character in the given string, starting at pos
593 """get the next full utf-8 character in the given string, starting at pos
594
594
595 Raises a UnicodeError if the given location does not start a valid
595 Raises a UnicodeError if the given location does not start a valid
596 utf-8 character.
596 utf-8 character.
597 """
597 """
598
598
599 # find how many bytes to attempt decoding from first nibble
599 # find how many bytes to attempt decoding from first nibble
600 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
600 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
601 if not l: # ascii
601 if not l: # ascii
602 return s[pos : pos + 1]
602 return s[pos : pos + 1]
603
603
604 c = s[pos : pos + l]
604 c = s[pos : pos + l]
605 # validate with attempted decode
605 # validate with attempted decode
606 c.decode("utf-8", _utf8strict)
606 c.decode("utf-8", _utf8strict)
607 return c
607 return c
608
608
609
609
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no surrogate lead byte: if the whole string decodes, it is
        # already valid UTF-8 and can pass through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    def _escapebyte(b):
        # map a single raw byte into the U+DC00-U+DCFF surrogate range
        return unichr(0xDC00 + ord(b)).encode('utf-8', _utf8strict)

    s = pycompat.bytestr(s)
    buf = bytearray()  # bytearray keeps appending amortized O(1)
    i, end = 0, len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
        except UnicodeDecodeError:
            ch = None
        if ch is None or b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # either an invalid byte, or a pre-existing U+DCxx character
            # that has to be re-escaped to keep the mapping reversible;
            # in both cases escape exactly one raw byte
            ch = _escapebyte(s[i])
            i += 1
        else:
            i += len(ch)
        buf += ch
    return bytes(buf)
677
677
678
678
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - without an \xed lead byte there can be no uDCxx escapes
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    out = bytearray()  # accumulate in a bytearray to stay linear-time
    i, end = 0, len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # a U+DCxx escape: recover the original raw byte it encodes
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        out += ch
    return bytes(out)
General Comments 0
You need to be logged in to leave comments. Login now