##// END OF EJS Templates
windows: replicate the normalizing behavior of os.environ...
Raphaël Gomès -
r48360:af633293 default
parent child Browse files
Show More
@@ -1,710 +1,719 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 if pycompat.TYPE_CHECKING:
23 if pycompat.TYPE_CHECKING:
24 from typing import (
24 from typing import (
25 Any,
25 Any,
26 Callable,
26 Callable,
27 List,
27 List,
28 Text,
28 Text,
29 Type,
29 Type,
30 TypeVar,
30 TypeVar,
31 Union,
31 Union,
32 )
32 )
33
33
34 # keep pyflakes happy
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
36 assert t
37
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
39
40 charencode = policy.importmod('charencode')
40 charencode = policy.importmod('charencode')
41
41
42 isasciistr = charencode.isasciistr
42 isasciistr = charencode.isasciistr
43 asciilower = charencode.asciilower
43 asciilower = charencode.asciilower
44 asciiupper = charencode.asciiupper
44 asciiupper = charencode.asciiupper
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
46
46
47 _sysstr = pycompat.sysstr
47 _sysstr = pycompat.sysstr
48
48
if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_hfs_ignored_codepoints = (
    b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff"
)
_ignore = [
    unichr(int(cp, 16)).encode("utf-8")
    for cp in _hfs_ignored_codepoints.split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
62
63
63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint encodes in UTF-8 starting with 0xe2 or 0xef
    # (asserted above next to _ignore), so a cheap containment check lets
    # clean strings skip the replace loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
77
77
78
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    # Python 2: os.environ is already byte strings
    environ = os.environ  # re-exports
elif _nativeenviron:
    # Python 3 with bytes environ support (POSIX)
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }
93
93
# rewrite legacy/nonstandard codec names reported by the platform to names
# Python's codec machinery understands
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'
103
103
try:
    # HGENCODING overrides the locale-derived encoding
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    # normalize nonstandard codec names (see _encodingrewrites above)
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# how undecodable bytes are handled: 'strict', 'replace' or 'ignore'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding tried for old changesets whose metadata isn't valid UTF-8
fallbackencoding = b'ISO-8859-1'
113
113
114
114
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # type: (bytes, bytes) -> localstr
        # the bytes payload is the local-encoding form ``l``; the original
        # UTF-8 form ``u`` is stashed on the instance so fromlocal() can
        # recover it losslessly
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
133
133
134
134
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, this type hashes and compares exactly like a plain
    bytes object, so it mixes freely with untagged strings:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
144
144
145
145
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid in every supported encoding; nothing to do
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8
            # bytes so fromlocal() can round-trip it
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # unknown codec name in `encoding`/`fallbackencoding`
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
213
213
214
214
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        # tolocal() cached the original UTF-8 bytes on the instance
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # include a little context around the offending byte in the message
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
243
243
244
244
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
249
249
250
250
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
255
255
256
256
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        # delegate to the bytes producer, then convert at the boundary
        result = bytesfunc(obj)
        return unifromlocal(result)

    return unifunc
266
266
267
267
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        # on Python 2 a native str is already a byte string
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        # on Python 2 a byte string is already a native str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
286
286
287
287
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for ASCII-only strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form so folding sees the real characters
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
309
309
310
310
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    # fast path for ASCII-only strings; non-ASCII input raises
    # UnicodeDecodeError and takes the encoding-aware fallback
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        result = upperfallback(s)
    return result
318
318
319
319
def upperfallback(s):
    # type: (Any) -> Any
    # encoding-aware uppercasing for strings that aren't pure ASCII;
    # mirrors the slow path of lower() above
    try:
        if isinstance(s, localstr):
            # use the cached UTF-8 form so folding sees the real characters
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        # unknown codec name in `encoding`
        raise error.Abort(k, hint=b"please check your locale settings")
336
336
337
337
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            # NOTE(review): only get() normalizes the key to uppercase;
            # __getitem__, __contains__ and item assignment still use the
            # key as given -- confirm callers rely on get() alone for
            # case-insensitive lookups.
            def get(self, key, default=None):
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
354
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        getcwd = lambda: strtolocal(os.path.realpath(os.getcwd()))  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    # Python 2: os.getcwd() already returns bytes
    getcwd = os.getcwd  # re-exports
362
371
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are unicodedata.east_asian_width() categories -- W(ide),
# F(ullwidth) and optionally A(mbiguous); ucolwidth() counts characters in
# these categories as two columns.
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
369
378
370
379
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
375
384
376
385
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # east_asian_width unavailable: assume one column per character
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
384
393
385
394
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    # grow the candidate byte range until it renders as exactly c columns
    for end in pycompat.xrange(start + c, len(s)):
        candidate = s[start:end]
        if colwidth(candidate) == c:
            return candidate
    raise ValueError('substring not found')
395
404
396
405
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by bytes, not columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the rest
    # fits in `width` display columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
493
502
494
503
class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    # numeric codes mirror normcase_spec in util.h
    lower = -1
    upper = 1
    other = 0
510
519
511
520
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize to UTF-8b first so all inputs share one escaping path
    u8chars = toutf8b(s)
    try:
        # the accelerated escaper signals unsupported input with
        # ValueError, in which case we fall back to the pure-Python one
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
564
573
565
574
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Total byte length of a UTF-8 sequence, indexed by the high nibble of its
# first byte.  0 means plain ASCII (handled separately); entries 8-11 cover
# bare continuation bytes, for which the 1-byte "sequence" will then fail
# the validating decode in getutf8char().
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
574
583
575
584
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # the first nibble of the lead byte tells us how many bytes to take
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if width == 0:
        # ASCII byte: trivially a complete character
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # validate the sequence with an attempted decode (result discarded)
    char.decode("utf-8", _utf8strict)
    return char
593
602
594
603
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no bytes from the surrogate area present: if the string
        # decodes cleanly it is already valid UTF-8 and passes through
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)

    def _surrogate(index):
        # map the raw byte at *index* into the U+DC00-U+DCFF range
        return unichr(0xDC00 + ord(s[index])).encode('utf-8', _utf8strict)

    out = b""
    i = 0
    end = len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
            if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                ch = _surrogate(i)
                i += 1
            else:
                i += len(ch)
        except UnicodeDecodeError:
            # invalid sequence: escape the single offending byte
            ch = _surrogate(i)
            i += 1
        out += ch
    return out
662
671
663
672
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        # unescape U+DCxx characters back to the raw byte they encode
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        pieces.append(ch)
    return b"".join(pieces)
General Comments 0
You need to be logged in to leave comments. Login now