##// END OF EJS Templates
encoding: add comment-based type hints for pytype...
Augie Fackler -
r44187:2ade00f3 default
parent child Browse files
Show More
@@ -1,660 +1,695
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
# Import typing names for the comment-based annotations below.  At module
# scope globals() is never empty (it always holds __name__ etc.), so this
# block never executes at runtime; pytype still analyzes it, which keeps
# the typing import invisible to ordinary users and to Python 2.
if not globals():  # hide this from non-pytype users
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # NOTE(review): 'localstr' is defined further down in this file; the
    # forward reference only works because this block never runs at
    # runtime and pytype resolves it over the whole module.
    _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
39
# Select the C or pure-Python charencode implementation per policy.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # unichr does not exist on Python 3; chr is the equivalent there.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45
62
46
63
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All ignored sequences start with byte 0xe2 or 0xef; when neither
    # occurs we can skip the replacement loop entirely.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, b'')
    return s
59
77
60
78
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# Map locale names that Python reports but cannot look up as codecs to
# their canonical equivalents.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# Determine the working character encoding: HGENCODING wins, then the
# locale's preferred encoding, with ASCII as the final fallback.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# Error-handler name used when decoding local strings ('strict' aborts on
# unknown bytes; 'replace'/'ignore' degrade gracefully).
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# Encoding tried for repository data that predates locale support.
fallbackencoding = b'ISO-8859-1'
95
113
96
114
class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back.

    The byte content of the instance is the local-encoding form; the
    original UTF-8 form is cached on the ``_utf8`` attribute so the
    conversion back is lossless.
    '''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # 'u' is the UTF-8 byte string and 'l' its local-encoding form.
        # Both are bytes: every construction site in this module passes
        # byte strings (the previous 'Text' annotation for 'u' matched no
        # call site).
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
108
127
109
128
class safelocalstr(bytes):
    """A local-encoding byte string known to round-trip to UTF-8.

    Unlike localstr, no separate UTF-8 copy is carried along: the type
    tag alone records that the conversion was lossless.  Instances
    therefore compare and hash exactly like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
119
138
120
139
def tolocal(s):
    # type: (bytes) -> bytes
    # annotation fix: the argument is a byte string (it is decoded with
    # s.decode() below), not Text as previously claimed.
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), r"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), r"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(k, hint=b"please check your locale settings")
185
205
186
206
def fromlocal(s):
    # type: (bytes) -> bytes
    # annotation fix: every return path yields bytes (s itself, s._utf8,
    # or u.encode("utf-8")), so the previous '-> Text' was wrong.
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the configured encoding name is unknown to the codecs registry
        raise error.Abort(k, hint=b"please check your locale settings")
214
235
215
236
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    # Route through UTF-8 so tolocal() can apply its caching/round-trip
    # machinery.
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
219
241
220
242
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which always decode cleanly.
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
224
247
225
248
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Wrap a __bytes__-style method so it can serve as __unicode__()
    or __str__() on Python 3.

    The returned callable invokes ``bytesfunc`` and converts its byte
    result to a unicode string in the local encoding.
    """

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc
234
258
235
259
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # On Python 2 the native str type already is a byte string, so the
    # converters are identity functions.

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity
254
278
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules (the provisional utf-8 dict
    # built earlier used a guessed encoding)
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )
262
286
if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are unicodedata.east_asian_width() categories counted as
# two columns by ucolwidth(): W(ide), F(ullwidth), and optionally
# A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
281
305
282
306
def colwidth(s):
    # type: (bytes) -> int
    b"Find the column width of a string for display in the local encoding"
    # Undecodable bytes are replaced rather than raising, so any input
    # yields a width.
    decoded = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(decoded)
286
311
287
312
def ucolwidth(d):
    # type: (Text) -> int
    b"Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no East Asian width data in this Python build; count every
        # character as one column
        return len(d)
    # characters whose width category is in _wide occupy two columns
    return sum([2 if eaw(c) in _wide else 1 for c in d])
294
320
295
321
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    # Grow the candidate one byte at a time until it renders as exactly
    # c columns; like the historical implementation, the probe stops
    # before the end of s and falls through (returning None) when no
    # prefix matches.
    x = start + c
    while x < len(s):
        t = s[start:x]
        if colwidth(t) == c:
            return t
        x += 1
303
330
304
331
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim byte string 's' to at most 'width' display columns.

    'ellipsis' (included in the budget) is placed at the trimmed side;
    with leftside=True the left end of 's' is trimmed instead of the
    right.  When 's' cannot be decoded in the local encoding, trimming
    falls back to counting bytes instead of display columns.
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # Undecodable input: every byte counts as one column.
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    # Drop characters from the trimmed side one at a time until the
    # remainder fits in the column budget.
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda t: ellipsis + t
    else:
        uslice = lambda i: u[:-i]
        concat = lambda t: t + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters
400
428
401
429
def lower(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware case-folding of local string s"
    # Pure-ASCII input takes the fast C path.
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        # Unchanged by folding: hand back the original object so a
        # localstr tag survives.
        if lu == u:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
422
451
423
452
def upper(s):
    # type: (bytes) -> bytes
    b"best-effort encoding-aware case-folding of local string s"
    # ASCII-only input takes the fast C path; anything containing
    # non-ASCII bytes defers to the slower encoding-aware fallback.
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
430
460
431
461
def upperfallback(s):
    # type: (bytes) -> bytes
    # annotation fix: was (Any) -> Any, but the body decodes 's' as bytes
    # and every return path produces bytes.
    """Encoding-aware uppercasing for strings asciiupper() rejected.

    Folds in Unicode space using the local encoding, preserving the
    original object (and any localstr tag) when folding is a no-op.
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")
447
478
448
479
class normcasespecs(object):
    '''Constants describing what a platform's normcase does to ASCII.

    Each platform declares one of these, and the declaration should
    agree with the platform's actual normcase behavior:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
464
495
465
496
def jsonescape(s, paranoid=False):
    # type: (bytes, bool) -> bytes
    # annotation fix: replaces the uninformative (Any, Any) -> Any with
    # the types every use below exhibits — byte strings in and out (see
    # the doctests), and a boolean flag defaulting to False.
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        # fast path: the accelerated helper signals unsupported input by
        # raising ValueError
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
517
549
518
550
519 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
551 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
520 # bytes are mapped to that range.
552 # bytes are mapped to that range.
521 if pycompat.ispy3:
553 if pycompat.ispy3:
522 _utf8strict = r'surrogatepass'
554 _utf8strict = r'surrogatepass'
523 else:
555 else:
524 _utf8strict = r'strict'
556 _utf8strict = r'strict'
525
557
526 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
558 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
527
559
528
560
def getutf8char(s, pos):
    # type: (Any, Any) -> Any
    '''Return the complete utf-8 encoded character of ``s`` starting at ``pos``.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # The high nibble of the lead byte encodes the sequence width
    # (0 entries in the table mean a plain single ASCII byte).
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not width:
        # ascii fast path: one byte, nothing to validate
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # validation: decoding raises UnicodeError on a malformed sequence
    char.decode("utf-8", _utf8strict)
    return char
545
578
546
579
def toutf8b(s):
    # type: (Any) -> Any
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # the cached UTF-8 form is assumed to never contain invalid
        # characters in the U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # s was already verified to be non-lossy in the legacy encoding,
        # which shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if b"\xed" not in s:
        # no potential surrogate bytes present; if the whole string
        # decodes as UTF-8 it can pass through unmodified
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        try:
            ch = getutf8char(s, i)
            if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
                # an existing U+DCxx character must itself be re-escaped
                # so that decoding remains unambiguous
                ch = unichr(0xDC00 + ord(s[i])).encode('utf-8', _utf8strict)
                i += 1
            else:
                i += len(ch)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DC00-U+DCFF surrogate range
            ch = unichr(0xDC00 + ord(s[i])).encode('utf-8', _utf8strict)
            i += 1
        pieces.append(ch)
    return b"".join(pieces)
613
647
614
648
def fromutf8b(s):
    # type: (Text) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that's was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path: no \xed byte means no U+DCxx escapes can be present
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        ch = getutf8char(s, i)
        i += len(ch)
        if b"\xed\xb0\x80" <= ch <= b"\xed\xb3\xbf":
            # a U+DCxx escape: recover the original raw byte from the
            # low 8 bits of the decoded code point
            ch = pycompat.bytechr(ord(ch.decode("utf-8", _utf8strict)) & 0xFF)
        pieces.append(ch)
    return b"".join(pieces)
@@ -1,27 +1,28
1 #require test-repo pyflakes hg10
1 #require test-repo pyflakes hg10
2
2
3 $ . "$TESTDIR/helpers-testrepo.sh"
3 $ . "$TESTDIR/helpers-testrepo.sh"
4
4
5 run pyflakes on all tracked files ending in .py or without a file ending
5 run pyflakes on all tracked files ending in .py or without a file ending
6 (skipping binary file random-seed)
6 (skipping binary file random-seed)
7
7
8 $ cat > test.py <<EOF
8 $ cat > test.py <<EOF
9 > print(undefinedname)
9 > print(undefinedname)
10 > EOF
10 > EOF
11 $ pyflakes test.py 2>/dev/null | "$TESTDIR/filterpyflakes.py"
11 $ pyflakes test.py 2>/dev/null | "$TESTDIR/filterpyflakes.py"
12 test.py:1: undefined name 'undefinedname'
12 test.py:1: undefined name 'undefinedname'
13
13
14 $ cd "`dirname "$TESTDIR"`"
14 $ cd "`dirname "$TESTDIR"`"
15
15
16 $ testrepohg locate 'set:**.py or grep("^#!.*python")' \
16 $ testrepohg locate 'set:**.py or grep("^#!.*python")' \
17 > -X hgext/fsmonitor/pywatchman \
17 > -X hgext/fsmonitor/pywatchman \
18 > -X mercurial/pycompat.py -X contrib/python-zstandard \
18 > -X mercurial/pycompat.py -X contrib/python-zstandard \
19 > -X mercurial/thirdparty/cbor \
19 > -X mercurial/thirdparty/cbor \
20 > -X mercurial/thirdparty/concurrent \
20 > -X mercurial/thirdparty/concurrent \
21 > -X mercurial/thirdparty/zope \
21 > -X mercurial/thirdparty/zope \
22 > 2>/dev/null \
22 > 2>/dev/null \
23 > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py"
23 > | xargs pyflakes 2>/dev/null | "$TESTDIR/filterpyflakes.py"
24 contrib/perf.py:*: undefined name 'xrange' (glob) (?)
24 contrib/perf.py:*: undefined name 'xrange' (glob) (?)
25 mercurial/hgweb/server.py:*: undefined name 'reload' (glob) (?)
25 mercurial/hgweb/server.py:*: undefined name 'reload' (glob) (?)
26 mercurial/util.py:*: undefined name 'file' (glob) (?)
26 mercurial/util.py:*: undefined name 'file' (glob) (?)
27 mercurial/encoding.py:*: undefined name 'localstr' (glob) (?)
27
28
General Comments 0
You need to be logged in to leave comments. Login now