upstream/mercurial-mirror Commit - r44078:7f51bc36

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import, print_function

8

from __future__ import absolute_import, print_function

9

10

import locale

10

import locale

11

import os

11

import os

12

import unicodedata

12

import unicodedata

13

14

from .pycompat import getattr

14

from .pycompat import getattr

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

from .pure import charencode as charencodepure

21

from .pure import charencode as charencodepure

22

23

# ``globals()`` is never empty at module level, so this branch does not run
# at import time; it exists only so that pytype can see the typing names.
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # type variable used by localstr.__new__'s type comment
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# the charencode implementation is resolved through policy.importmod()
charencode = policy.importmod('charencode')

# module-level aliases for the charencode helpers used below
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # provide the Python 2 spelling so the code below works on both versions
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry is the UTF-8 encoding of one of the listed hex codepoints.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
# (hfsignoreclean only scans strings containing a 0xe2 or 0xef byte)
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)

62

63

64

def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Strip the codepoints that HFS+ ignores out of s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every entry of _ignore starts with 0xe2 or 0xef (asserted at import
    # time), so a string holding neither byte cannot contain any of them
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, b'')
    return s

77

78

79

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
# True when a bytes-keyed environment is available directly from ``os``.
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# encoding names reported by the locale that are rewritten to another name
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# HGENCODING wins when set; otherwise the locale's preferred encoding is
# used (after rewriting), and b'ascii' is the fallback on locale.Error.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# error-handling mode passed to bytes.decode() by fromlocal()/lower()/upper()
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# encoding tried by tolocal() when a string is not valid UTF-8
fallbackencoding = b'ISO-8859-1'

113

114

115

class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The instance itself holds the local-encoding bytes, while the original
    UTF-8 bytes are kept in the ``_utf8`` attribute.'''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # u: original UTF-8 encoded bytes; l: local-encoding representation
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space

127

128

129

class safelocalstr(bytes):
    """Marker type for a local string that came from an internal UTF-8
    string and can be converted back to UTF-8 without loss

    It adds nothing to bytes, so it compares and hashes like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

138

139

140

def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Decoding is attempted strictly as UTF-8, then with the fallback
    encoding (Latin-1), and as a last resort as UTF-8 with unknown
    characters replaced.

    When the local representation is lossy, a localstr is returned so the
    known UTF-8 bytes travel with the local form and fromlocal() can
    restore them exactly.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII-only input is handed back untouched
    if isasciistr(s):
        return s

    try:
        try:
            # strict decode: checks that s really is stored as UTF-8
            unistr = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path, nothing to convert
                return s
            local = unistr.encode(_sysstr(encoding), "replace")
            if unistr != local.decode(_sysstr(encoding)):
                # lossy conversion: keep the UTF-8 bytes alongside
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return safelocalstr(local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                unistr = s.decode(_sysstr(fallbackencoding))
                local = unistr.encode(_sysstr(encoding), "replace")
                if unistr != local.decode(_sysstr(encoding)):
                    return localstr(unistr.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            except UnicodeDecodeError:
                # last ditch; the result can't be round-tripped
                unistr = s.decode("utf-8", "replace")
                return unistr.encode(_sysstr(encoding), "replace")
    except LookupError as inst:
        raise error.Abort(inst, hint=b"please check your locale settings")

205

206

207

def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode taken from HGENCODINGMODE, which defaults
    to 'strict'. In that mode, unknown characters abort with an error
    message. Other modes include 'replace', which substitutes a special
    Unicode character for unknown ones, and 'ignore', which drops them.
    """

    # a localstr already carries its UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes on each side of the failing offset
        first = max(0, inst.start - 10)
        last = inst.start + 10
        raise error.Abort(
            b"decoding near '%s': %s!" % (s[first:last], pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

235

236

237

def unitolocal(u):
    # type: (Text) -> bytes
    """Turn a unicode string into a byte string in the local encoding"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

241

242

243

def unifromlocal(s):
    # type: (bytes) -> Text
    """Turn a byte string in the local encoding into a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

247

248

249

def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Build a proxy method that lets Python 3's __unicode__() and
    __str__() forward to __bytes__()

    The returned function calls ``bytesfunc`` on the object and converts
    the local-encoding bytes it returns to a unicode string."""

    def unifunc(obj):
        localbytes = bytesfunc(obj)
        return unifromlocal(localbytes)

    return unifunc

258

259

260

# converter functions between native str and byte string. use these where
# the character encoding is unknown (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if not pycompat.ispy3:

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
else:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod

if not _nativeenviron:
    # encoding and the helper functions exist now, so rebuild the environ
    # dict that other modules import, this time in the local encoding
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if not pycompat.ispy3:
    getcwd = os.getcwd  # re-exports
elif pycompat.iswindows:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes. Python 3 on Windows issues a DeprecationWarning about
    # using the bytes API when os.getcwdb() is called, so convert instead.
    getcwd = lambda: strtolocal(os.getcwd())  # re-exports
else:
    getcwd = os.getcwdb  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)

305

306

307

def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding

    ``s`` is decoded with the module-level ``encoding`` (undecodable bytes
    are replaced) and the result is measured by ucolwidth().
    """
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))

311

312

313

def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display

    A character whose East Asian Width class appears in the module-level
    ``_wide`` string counts as two columns, any other character as one.
    When unicodedata has no east_asian_width(), every character counts as
    one column.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)

320

321

322

def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate slices s[start:x] are tried for growing x, beginning at
    start + c, and the first one that is exactly c columns wide is returned.
    Raises ValueError when no candidate matches.'''
    # the stop is len(s) + 1 so that the slice reaching the very end of s
    # is tried as well
    for x in pycompat.xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')

331

332

333

def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    # columns left for text once the ellipsis has been accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to treating every byte as one column
        if len(s) <= width:  # trimming is not needed
            return s
        if room <= 0:  # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    if room <= 0:  # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character from the trimmed side on every pass until
    # what remains fits into the available columns
    for cut in pycompat.xrange(1, len(u)):
        if leftside:
            kept = u[cut:]
        else:
            kept = u[:-cut]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis  # no enough room for multi-column characters

429

430

431

def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s

    asciilower() is tried first. If it raises UnicodeDecodeError, ``s`` is
    decoded (from its cached UTF-8 form when it is a localstr, otherwise
    from the local encoding), lowercased and re-encoded to the local
    encoding. ``s`` itself is returned when lowercasing changes nothing.

    Raises error.Abort when the configured encoding is unknown.
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

452

453

454

def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s

    asciiupper() is tried first; when it raises UnicodeDecodeError the
    work is handed to upperfallback().
    """
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

461

462

463

def upperfallback(s):
    # type: (Any) -> Any
    """Uppercase local string s by going through unicode

    A localstr is decoded from its cached UTF-8 form, anything else from
    the local encoding. ``s`` itself is returned when uppercasing changes
    nothing. Raises error.Abort when the configured encoding is unknown.
    """
    try:
        if not isinstance(s, localstr):
            unistr = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        else:
            unistr = s._utf8.decode("utf-8")

        folded = unistr.upper()
        if unistr != folded:
            return folded.encode(_sysstr(encoding))
        return s  # preserve localstring
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as inst:
        raise error.Abort(inst, hint=b"please check your locale settings")

479

480

481

class normcasespecs(object):
    '''describes how a platform's normcase treats ASCII strings

    The value is chosen per platform and has to agree with what normcase
    really does there.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    other = 0
    lower = -1
    upper = 1

496

497

498

def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the fast implementation signalled it cannot handle this input;
        # fall through to the pure-Python fallback below
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)

550

551

552

# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# Number of bytes getutf8char attempts to decode, indexed by the high nibble
# of the first byte; 0 means the byte is treated as a single ASCII character.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

560

561

562

def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    # (slicing rather than indexing yields a length-1 bytes object for ord())
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c

579

580

581

def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no candidate for an already-escaped U+DCxx sequence: if the whole
        # string decodes cleanly it can be passed through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string one UTF-8 character at a time
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not a valid UTF-8 character here: escape this single byte
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r

648

649

650

def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # the low 8 bits of the code point are the original byte
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import, print_function
             import locale
             import os
             import unicodedata
             from .pycompat import getattr
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             from .pure import charencode as charencodepure
             # globals() is never empty at module scope, so this branch does not
             # execute at runtime; it only makes the typing names used by the
             # "# type:" comments in this module visible to pytype.
             if not globals():  # hide this from non-pytype users
                 from typing import (
                     Any,
                     Callable,
                     List,
                     Text,
                     Type,
                     TypeVar,
                     Union,
                 )
                 # keep pyflakes happy
                 for t in (Any, Callable, List, Text, Type, Union):
                     assert t
                 # type variable bound to localstr, used by localstr.__new__
                 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
             # charencode implementation chosen by the module policy; the helpers
             # below are bound once as module-level aliases
             charencode = policy.importmod('charencode')
             isasciistr = charencode.isasciistr
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             _jsonescapeu8fast = charencode.jsonescapeu8fast
             _sysstr = pycompat.sysstr
             # Python 3 has no unichr builtin; chr plays that role there
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry is the UTF-8 encoding of one of the hex code points listed.
             _ignore = [
                 unichr(int(x, 16)).encode("utf-8")
                 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
                 b"206a 206b 206c 206d 206e 206f feff".split()
             ]
             # verify the next function will work
             # (hfsignoreclean only scans strings containing one of these lead bytes)
             assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 # type: (bytes) -> bytes
                 """Strip every HFS+-ignored code point (see _ignore) out of s.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every ignored sequence starts with one of these two bytes, so a
                 # string containing neither cannot need cleaning
                 if b"\xe2" not in s and b"\xef" not in s:
                     return s
                 cleaned = s
                 for ignored in _ignore:
                     cleaned = cleaned.replace(ignored, b'')
                 return cleaned
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # _nativeenviron is True on Python 2, and on Python 3 when
             # os.supports_bytes_environ is set.
             _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 # (this dict is a snapshot built from os.environ at import time)
                 environ = dict(
                     (k.encode('utf-8'), v.encode('utf-8'))
                     for k, v in os.environ.items()  # re-exports
                 )
             # locale-reported encoding names that are rewritten to another name
             _encodingrewrites = {
                 b'646': b'ascii',
                 b'ANSI_X3.4-1968': b'ascii',
             }
             # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
             # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
             # https://bugs.python.org/issue13216
             if pycompat.iswindows and not pycompat.ispy3:
                 _encodingrewrites[b'cp65001'] = b'utf-8'
             # HGENCODING, when set and non-empty, takes precedence over the locale's
             # preferred encoding; a locale.Error falls back to ASCII.
             try:
                 encoding = environ.get(b"HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
                     encoding = _encodingrewrites.get(encoding, encoding)
             except locale.Error:
                 encoding = b'ascii'
             # codec error handler used when decoding local strings (see fromlocal)
             encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
             # encoding tolocal tries when its input is not valid UTF-8
             fallbackencoding = b'ISO-8859-1'
             class localstr(bytes):
                 '''This class allows strings that are unmodified to be
                 round-tripped to the local encoding and back'''
                 def __new__(cls, u, l):
                     # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
                     # u: the UTF-8 encoded bytes, kept on the instance as _utf8
                     # l: the local-encoding bytes that become this object's value
                     s = bytes.__new__(cls, l)
                     s._utf8 = u
                     return s
                 def __hash__(self):
                     # hash the UTF-8 form rather than the (possibly lossy) local bytes
                     return hash(self._utf8)  # avoid collisions in local string space
             class safelocalstr(bytes):
                 """Tagged string denoting it was previously an internal UTF-8 string,
                 and can be converted back to UTF-8 losslessly

                 The class adds nothing to bytes; the type itself is the tag, so
                 instances compare and hash like plain bytes.

                 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
                 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
                 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
                 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
                 """
             def tolocal(s):
                 # type: (bytes) -> bytes
                 """
                 Convert a string from internal UTF-8 to local encoding
                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.
                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.
                 >>> u = b'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = b'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> b'foo: ?' in d
                 False
                 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 # pure ASCII needs no conversion
                 if isasciistr(s):
                     return s
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == b'UTF-8':
                             # fast path
                             return s
                         r = u.encode(_sysstr(encoding), "replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return safelocalstr(r)
                         # lossy: keep the original UTF-8 next to the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), "replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return safelocalstr(r)
                             # cache the UTF-8 re-encoding of the fallback-decoded text
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace")  # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), "replace")
                 except LookupError as k:
                     # the configured encoding name is not a known codec
                     raise error.Abort(k, hint=b"please check your locale settings")
             def fromlocal(s):
                 # type: (bytes) -> bytes
                 """
                 Convert a string from the local character encoding to UTF-8

                 Decoding uses the error mode taken from HGENCODINGMODE, 'strict'
                 by default, in which undecodable input aborts with an error
                 message. 'replace' substitutes a special Unicode character for
                 unknown characters and 'ignore' drops them.
                 """
                 # a localstr carries its UTF-8 original: lossless round-trip
                 if isinstance(s, localstr):
                     return s._utf8
                 # ASCII needs no conversion
                 if isasciistr(s):
                     return s
                 try:
                     decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as inst:
                     # quote up to ten bytes either side of the failure position
                     first = max(0, inst.start - 10)
                     context = s[first : inst.start + 10]
                     message = b"decoding near '%s': %s!" % (
                         context,
                         pycompat.bytestr(inst),
                     )
                     raise error.Abort(message)
                 except LookupError as k:
                     raise error.Abort(k, hint=b"please check your locale settings")
             def unitolocal(u):
                 # type: (Text) -> bytes
                 """Encode unicode string u as UTF-8 and hand it to tolocal, giving
                 a byte string in the local encoding"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 # type: (bytes) -> Text
                 """Run local-encoding byte string s through fromlocal and decode the
                 resulting UTF-8 into a unicode string"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             def unimethod(bytesfunc):
                 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
                 """Wrap bytesfunc in a proxy method suitable for __unicode__() and
                 Python 3's __str__(): it forwards to bytesfunc (e.g. __bytes__())
                 and converts the local-encoded result to unicode"""
                 def unifunc(obj):
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)
                 return unifunc
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             if pycompat.ispy3:
                 # native str is unicode on Python 3: reuse the unicode converters
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 # native str is already a byte string on Python 2: identity
                 def strtolocal(s):
                     # type: (str) -> bytes
                     return s  # pytype: disable=bad-return-type
                 def strfromlocal(s):
                     # type: (bytes) -> str
                     return s  # pytype: disable=bad-return-type
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 # (keys and values go through UTF-8 and then tolocal)
                 environ = dict(
                     (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
                     for k, v in os.environ.items()  # re-exports
                 )
             # getcwd: returns the current working directory as a byte string
             if pycompat.ispy3:
                 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
                 # returns bytes.
                 if pycompat.iswindows:
                     # Python 3 on Windows issues a DeprecationWarning about using the bytes
                     # API when os.getcwdb() is called.
                     # so take the str result and convert it with strtolocal instead
                     getcwd = lambda: strtolocal(os.getcwd())  # re-exports
                 else:
                     getcwd = os.getcwdb  # re-exports
             else:
                 getcwd = os.getcwd  # re-exports
             # East Asian width classes counted as two columns by ucolwidth. Setting
             # HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class "A" to "W"/"F".
             _wide = _sysstr(
                 b"WFA"
                 if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
                 else b"WF"
             )
             def colwidth(s):
                 # type: (bytes) -> int
                 """Find the column width of a string for display in the local encoding

                 s is decoded with the module's local encoding using 'replace', so
                 undecodable bytes do not raise; the width is computed by ucolwidth.
                 """
                 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
             def ucolwidth(d):
                 # type: (Text) -> int
                 """Find the column width of a Unicode string for display

                 Characters whose East Asian width class is listed in _wide count as
                 two columns, all others as one. If unicodedata has no
                 east_asian_width, the width is simply len(d).
                 """
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 return sum(2 if eaw(c) in _wide else 1 for c in d)
             def getcols(s, start, c):
                 # type: (bytes, int, int) -> bytes
                 '''Return the substring of s beginning at byte index start whose
                 colwidth is exactly c columns

                 Candidate end offsets run from start + c up to, but not including,
                 len(s). Raises ValueError when no candidate matches.
                 '''
                 # NOTE(review): the end offset len(s) itself is never tried, so a
                 # match reaching the end of s is not found - confirm this is intended
                 first_end = start + c
                 for end in pycompat.xrange(first_end, len(s)):
                     candidate = s[start:end]
                     if colwidth(candidate) == c:
                         return candidate
                 raise ValueError('substring not found')
             def trim(s, width, ellipsis=b'', leftside=False):
                 # type: (bytes, int, bytes, bool) -> bytes
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').
                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.
                 >>> from .node import bin
                 >>> def bprint(s):
                 ...     print(pycompat.sysstr(s))
                 >>> ellipsis = b'+++'
                 >>> from . import encoding
                 >>> encoding.encoding = b'utf-8'
                 >>> t = b'1234567890'
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 12345+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++67890
                 >>> bprint(trim(t, 8))
                 12345678
                 >>> bprint(trim(t, 8, leftside=True))
                 34567890
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 5))
                 \xe3\x81\x82\xe3\x81\x84
                 >>> bprint(trim(t, 5, leftside=True))
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 4, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
                 +++
                 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8))
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> bprint(trim(t, 8, leftside=True))
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # s is not valid in the local encoding: trim by byte count
                     # instead of by display columns
                     if len(s) <= width:  # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0:  # no enough room even for ellipsis
                         # width + len(ellipsis) is the originally requested width
                         return ellipsis[: width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width:  # trimming is not needed
                     return s
                 width -= len(ellipsis)
                 if width <= 0:  # no enough room even for ellipsis
                     return ellipsis[: width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side; concat puts
                 # the ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the rest fits in width
                 for i in pycompat.xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis  # no enough room for multi-column characters
             def lower(s):
                 # type: (bytes) -> bytes
                 '''best-effort encoding-aware lowercasing of local string s

                 The ASCII-only asciilower() is tried first. If it raises
                 UnicodeDecodeError, s is decoded (from the cached UTF-8 form of a
                 localstr, otherwise using the configured encoding and
                 encodingmode), lowercased as unicode and encoded back to the
                 local encoding.

                 s itself is returned when lowercasing changes nothing, so that a
                 localstr is preserved. If decoding or re-encoding fails with a
                 UnicodeError, s.lower() is returned instead.

                 Raises error.Abort if a LookupError occurs while transcoding.
                 '''
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # fall through to the encoding-aware path below
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     lu = u.lower()
                     if u == lu:
                         return s  # preserve localstring
                     return lu.encode(_sysstr(encoding))
                 except UnicodeError:
                     return s.lower()  # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint=b"please check your locale settings")
             def upper(s):
                 # type: (bytes) -> bytes
                 '''best-effort encoding-aware uppercasing of local string s

                 The ASCII-only asciiupper() is tried first; if it raises
                 UnicodeDecodeError, the work is delegated to upperfallback().
                 '''
                 try:
                     return asciiupper(s)
                 except UnicodeDecodeError:
                     return upperfallback(s)
             def upperfallback(s):
                 # type: (Any) -> Any
                 '''encoding-aware uppercasing of local string s

                 s is decoded (from the cached UTF-8 form of a localstr, otherwise
                 using the configured encoding and encodingmode), uppercased as
                 unicode and encoded back to the local encoding. s itself is
                 returned when uppercasing changes nothing.

                 If decoding or re-encoding fails with a UnicodeError, s.upper()
                 is returned. A LookupError is turned into error.Abort.
                 '''
                 try:
                     if not isinstance(s, localstr):
                         decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     else:
                         decoded = s._utf8.decode("utf-8")
                     folded = decoded.upper()
                     if folded != decoded:
                         return folded.encode(_sysstr(encoding))
                     # nothing changed: hand back the input to preserve localstring
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.upper()
                 except LookupError as inst:
                     raise error.Abort(inst, hint=b"please check your locale settings")
             class normcasespecs(object):
                 '''describes what a platform's normcase does to ASCII strings

                 The value is chosen per platform and should match what normcase
                 on that platform actually does:

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 Keep this in sync with normcase_spec in util.h.'''

                 lower, upper, other = -1, 1, 0
             def jsonescape(s, paranoid=False):
                 # type: (Any, Any) -> Any
                 '''returns a string suitable for JSON

                 JSON has no way to carry non-Unicode bytes, so the input is
                 handled as follows:

                 - localstr/safelocalstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 The input is first passed through toutf8b(). The escaping itself
                 is attempted with _jsonescapeu8fast(); if that raises ValueError,
                 charencodepure.jsonescapeu8fallback() is used instead.

                 (escapes are doubled in these tests)

                 >>> jsonescape(b'this is a test')
                 'this is a test'
                 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
                 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape(b'a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape(b'')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> s = b'escape characters: \\0 \\x0b \\x7f'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape(b'<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 utf8b = toutf8b(s)
                 try:
                     escaped = _jsonescapeu8fast(utf8b, paranoid)
                 except ValueError:
                     # the fast escaper rejected this input; use the fallback below
                     pass
                 else:
                     return escaped
                 return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
             # Invalid UTF-8 bytes are mapped to the U+DCxx range, so code points in
             # that range have to be decoded/encoded transparently. On Python 3 this
             # selects the 'surrogatepass' error handler; otherwise 'strict' is used.
             _utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 # type: (bytes, int) -> bytes
                 '''return the full utf-8 character of s that starts at offset pos

                 The returned value is the slice of s holding that character.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos : pos + 1]
                 # the first nibble of the lead byte tells how many bytes to try
                 size = _utf8len[ord(lead) >> 4]
                 if size == 0:  # ascii
                     return lead
                 char = s[pos : pos + size]
                 # decode only to validate; the decoded value itself is discarded
                 char.decode("utf-8", _utf8strict)
                 return char
             def toutf8b(s):
                 # type: (bytes) -> bytes
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # assume that the original UTF-8 sequence would never contain
                     # invalid characters in U+DCxx range
                     return s._utf8
                 elif isinstance(s, safelocalstr):
                     # already verified that s is non-lossy in legacy encoding, which
                     # shouldn't contain characters in U+DCxx range
                     return fromlocal(s)
                 elif isasciistr(s):
                     return s
                 # Every encoded U+DCxx character starts with 0xED, so a string
                 # without that byte that decodes cleanly needs no escaping at all.
                 if b"\xed" not in s:
                     try:
                         s.decode('utf-8', _utf8strict)
                         return s
                     except UnicodeDecodeError:
                         pass
                 s = pycompat.bytestr(s)
                 # Collect the pieces and join once at the end: repeated "+=" on
                 # bytes copies the accumulated result on every iteration.
                 chunks = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         # have to re-escape existing U+DCxx characters
                         escape = b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf"
                     except UnicodeDecodeError:
                         # no valid character starts here: escape this single byte
                         escape = True
                     if escape:
                         # map the byte at pos to U+DC00 + byte value and advance by
                         # one byte only, so the following bytes are examined too
                         c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                         pos += 1
                     else:
                         pos += len(c)
                     chunks.append(c)
                 return b"".join(chunks)
             def fromutf8b(s):
                 # type: (bytes) -> bytes
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 This reverses toutf8b() and returns the original binary string. It
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 Raises a UnicodeError (from getutf8char) if s contains a 0xED byte
                 and is not made of valid utf-8 characters.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = b"\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip(b"\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip(b"\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 if isasciistr(s):
                     return s
                 # fast path - look for uDxxx prefixes in s
                 if b"\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 s = pycompat.bytestr(s)
                 # Collect the pieces and join once at the end: repeated "+=" on
                 # bytes copies the accumulated result on every iteration.
                 chunks = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters
                     if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                         # the low 8 bits of the code point are the original byte
                         c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
                     chunks.append(c)
                 return b"".join(chunks)