upstream/mercurial-mirror Commit - r44074:7cf33231

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import, print_function

8

from __future__ import absolute_import, print_function

9

10

import locale

10

import locale

11

import os

11

import os

12

import unicodedata

12

import unicodedata

13

14

from .pycompat import getattr

14

from .pycompat import getattr

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

from .pure import charencode as charencodepure

21

from .pure import charencode as charencodepure

22

23

if not globals():  # hide this from non-pytype users
    # globals() is never empty at module level, so this branch is only ever
    # seen by static type checkers and never executed at runtime.
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    # localstr is defined further down in this module, so refer to it by
    # name (forward reference) rather than by an as-yet-unbound identifier.
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# implementation of the character-encoding helpers selected by the module
# policy
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: hfsignoreclean() only scans for these
# two lead bytes before doing the (more expensive) replacements
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)

62

63

64

def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    ``s`` is a UTF-8 byte string. Every byte sequence listed in the
    module-level ``_ignore`` table is deleted from it and the result is
    returned; ``s`` is returned unchanged when it contains neither of the
    lead bytes those sequences start with.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: all entries of _ignore begin with \xe2 or \xef, so a
    # string without those bytes cannot contain any of them
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s

77

78

79

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode('utf-8'), v.encode('utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

# encoding names reported by the platform that are rewritten to the name
# used internally
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

try:
    # HGENCODING, when set and non-empty, wins over the locale's preferred
    # encoding; only the locale-derived name goes through the rewrite table
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'

113

114

115

class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The bytes value of the instance is the local-encoding representation;
    the original UTF-8 byte string is kept in the ``_utf8`` attribute.'''

    def __new__(cls, u, l):
        # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
        # u: the UTF-8 bytes to remember, l: the local-encoding bytes that
        # become the value of the new object
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space

127

128

129

class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    The class adds nothing to ``bytes``; instances compare and hash like
    the plain byte string they wrap.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

138

139

140

def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Raises error.Abort if the configured encoding name is unknown to
    Python (LookupError).

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy: remember the original UTF-8 next to the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

205

206

207

def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises error.Abort when the string cannot be decoded or when the
    configured encoding name is unknown to Python.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes on either side of the offending position
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

235

236

237

def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    # go through UTF-8, the form tolocal() expects as input
    return tolocal(u.encode('utf-8'))

241

242

243

def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    # fromlocal() yields UTF-8 bytes, which are then decoded to unicode
    return fromlocal(s).decode('utf-8')

247

248

249

def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    ``bytesfunc`` is called with the object and must return a byte string
    in the local encoding; the returned wrapper converts that result to
    unicode with unifromlocal()."""

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc

258

259

260

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    def strtolocal(s):
        # type: (str) -> bytes
        return s

    def strfromlocal(s):
        # type: (bytes) -> str
        return s

    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# _wide holds the east_asian_width() categories counted as two columns by
# ucolwidth(): "WFA" when HGENCODINGAMBIGUOUS is b"wide", otherwise "WF".
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)

305

306

307

def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding

    ``s`` is decoded with the module-level ``encoding``; undecodable bytes
    are substituted (``'replace'``) rather than raising."""
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))

311

312

313

def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display

    Characters whose east-asian-width category is listed in the
    module-level ``_wide`` string count as two columns, all others as one.
    If ``unicodedata`` has no ``east_asian_width``, the number of
    characters is returned instead."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum(2 if eaw(c) in _wide else 1 for c in d)
    return len(d)

320

321

322

def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice ``s[start:x]`` whose display width is exactly
    ``c`` columns. Raises ValueError if no such slice is found.'''
    # A slice narrower than c bytes cannot be c columns wide, so start the
    # search at start + c.
    # NOTE(review): x never reaches len(s), so a substring extending to the
    # very end of s is never tried -- confirm whether that is intended.
    for x in pycompat.xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')

330

331

332

def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # s is not valid in the local encoding: fall back to treating each
        # byte as one column
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one more character from the trimmed side on each iteration until
    # what is left fits into the remaining width
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters

428

429

430

def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s

    Returns ``s`` itself when lower-casing changes nothing, so that a
    localstr is preserved. Raises error.Abort if the configured encoding
    name is unknown to Python."""
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # asciilower() rejected the input; take the encoding-aware path
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

451

452

453

def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware upper-casing of local string s

    Tries asciiupper() first and defers to upperfallback() when that raises
    UnicodeDecodeError."""
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

460

461

462

def upperfallback(s):
    # type: (Any) -> Any
    """Encoding-aware upper-casing of local string s, used by upper() when
    the ASCII fast path does not apply

    Returns ``s`` itself when upper-casing changes nothing, so that a
    localstr is preserved. Raises error.Abort if the configured encoding
    name is unknown to Python."""
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint=b"please check your locale settings")

478

479

480

class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0

495

496

497

def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    u8chars = toutf8b(s)
    try:
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        # the fast escaper gave up on this input; use the pure-Python
        # fallback below instead
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)

549

550

551

# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# Indexed by the top four bits of a byte (see getutf8char): the number of
# bytes to try decoding from that position; 0 means a single-byte character.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

559

560

561

def getutf8char(s, pos):
    # type: (Any, Any) -> Any
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode; the decoded value itself is discarded
    c.decode("utf-8", _utf8strict)
    return c

578

579

580

def toutf8b(s):
    # type: (Any) -> Any
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no \xed byte means no U+DCxx sequence to re-escape, so a string
        # that decodes cleanly can be returned as-is
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    # collect the output pieces and join once at the end instead of
    # repeatedly concatenating bytes
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # not a valid character start: escape this single byte
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        parts.append(c)
    return b"".join(parts)

647

648

649

def fromutf8b(s):
    # type: (Text) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This reverses toutf8b and returns the original binary string. It
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # collect the output pieces and join once at the end instead of
    # repeatedly concatenating bytes
    parts = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # the low eight bits of the code point are the original byte
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        parts.append(c)
    return b"".join(parts)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import, print_function
             import locale
             import os
             import unicodedata
             from .pycompat import getattr
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             from .pure import charencode as charencodepure
             if not globals():  # hide this from non-pytype users
                 from typing import (
                     Any,
                     Callable,
                     List,
                     Text,
                     Type,
                     TypeVar,
                     Union,
                 )
                 # keep pyflakes happy
                 for t in (Any, Callable, List, Text, Type, Union):
                     assert t
                 _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
             charencode = policy.importmod('charencode')
             isasciistr = charencode.isasciistr
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             _jsonescapeu8fast = charencode.jsonescapeu8fast
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             _ignore = [
                 unichr(int(x, 16)).encode("utf-8")
                 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
                 b"206a 206b 206c 206d 206e 206f feff".split()
             ]
             # verify the next function will work
             assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 # type: (bytes) -> bytes
                 """Strip every codepoint that HFS+ ignores out of s.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every entry of _ignore starts with \xe2 or \xef (asserted at
                 # module level), so a string with neither byte needs no work
                 if b"\xe2" not in s and b"\xef" not in s:
                     return s
                 cleaned = s
                 for ignored in _ignore:
                     cleaned = cleaned.replace(ignored, b'')
                 return cleaned
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict(
                     (k.encode('utf-8'), v.encode('utf-8'))
                     for k, v in os.environ.items()  # re-exports
                 )
             _encodingrewrites = {
                 b'646': b'ascii',
                 b'ANSI_X3.4-1968': b'ascii',
             }
             # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
             # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
             # https://bugs.python.org/issue13216
             if pycompat.iswindows and not pycompat.ispy3:
                 _encodingrewrites[b'cp65001'] = b'utf-8'
             try:
                 encoding = environ.get(b"HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
                     encoding = _encodingrewrites.get(encoding, encoding)
             except locale.Error:
                 encoding = b'ascii'
             encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
             fallbackencoding = b'ISO-8859-1'
             class localstr(bytes):
                 '''Local-encoding byte string that remembers its UTF-8 original

                 Keeping the UTF-8 form next to the local bytes lets an unmodified
                 string be round-tripped to the local encoding and back.'''

                 def __new__(cls, u, l):
                     # type: (Type[_Tlocalstr], Text, bytes) -> _Tlocalstr
                     # the bytes value is the local form ``l``; the UTF-8 form
                     # ``u`` is stored as an attribute
                     obj = bytes.__new__(cls, l)
                     obj._utf8 = u
                     return obj

                 def __hash__(self):
                     # hash the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
             class safelocalstr(bytes):
                 """Tagged string denoting it was previously an internal UTF-8 string,
                 and can be converted back to UTF-8 losslessly

                 The class adds no behaviour of its own; instances compare and hash
                 like the plain bytes they wrap, as the checks below show.

                 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
                 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
                 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
                 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
                 """
             def tolocal(s):
                 # type: (Text) -> bytes
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 >>> u = b'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = b'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> b'foo: ?' in d
                 False
                 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 if isasciistr(s):
                     return s
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == b'UTF-8':
                             # fast path
                             return s
                         r = u.encode(_sysstr(encoding), "replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return safelocalstr(r)
                         # lossy: keep the UTF-8 original alongside the local bytes
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), "replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return safelocalstr(r)
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace")  # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), "replace")
                 except LookupError as k:
                     # the codec lookup failed, e.g. for an unknown encoding name
                     raise error.Abort(k, hint=b"please check your locale settings")
             def fromlocal(s):
                 # type: (bytes) -> bytes
                 """
                 Turn a byte string in the local character encoding into UTF-8

                 Decoding uses the error mode taken from HGENCODINGMODE, which
                 defaults to 'strict'. In that mode an undecodable character
                 aborts with an error message. Other modes include 'replace',
                 which substitutes a special Unicode character for unknown ones,
                 and 'ignore', which drops them.
                 """
                 # a localstr carries its UTF-8 original: lossless round-trip
                 if isinstance(s, localstr):
                     return s._utf8
                 # strings for which isasciistr() holds are returned unchanged
                 if isasciistr(s):
                     return s
                 try:
                     decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to ten bytes on either side of the failure point
                     lo = max(0, err.start - 10)
                     hi = err.start + 10
                     context = s[lo:hi]
                     raise error.Abort(
                         b"decoding near '%s': %s!" % (context, pycompat.bytestr(err))
                     )
                 except LookupError as lookuperr:
                     raise error.Abort(
                         lookuperr, hint=b"please check your locale settings"
                     )
             def unitolocal(u):
                 # type: (Text) -> bytes
                 """Encode a unicode string as UTF-8 and pass it through tolocal()"""
                 utf8bytes = u.encode('utf-8')
                 return tolocal(utf8bytes)
             def unifromlocal(s):
                 # type: (bytes) -> Text
                 """Pass a local byte string through fromlocal() and decode the
                 resulting UTF-8 into a unicode string"""
                 utf8bytes = fromlocal(s)
                 return utf8bytes.decode('utf-8')
             def unimethod(bytesfunc):
                 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
                 """Wrap a bytes-returning method so it returns unicode instead

                 Intended for forwarding __unicode__() and __str__() of Python 3
                 to __bytes__()."""

                 def unifunc(obj):
                     # call the wrapped method, then convert its local bytes
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)

                 return unifunc
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 def strtolocal(s):
                     # type: (str) -> bytes
                     return s
                 def strfromlocal(s):
                     # type: (bytes) -> str
                     return s
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict(
                     (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
                     for k, v in os.environ.items()  # re-exports
                 )
             if pycompat.ispy3:
                 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
                 # returns bytes.
                 if pycompat.iswindows:
                     # Python 3 on Windows issues a DeprecationWarning about using the bytes
                     # API when os.getcwdb() is called.
                     getcwd = lambda: strtolocal(os.getcwd())  # re-exports
                 else:
                     getcwd = os.getcwdb  # re-exports
             else:
                 getcwd = os.getcwd  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             _wide = _sysstr(
                 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
                 and b"WFA"
                 or b"WF"
             )
             def colwidth(s):
                 # type: (bytes) -> int
                 """Find the column width of a string for display in the local encoding

                 ``s`` is decoded with the module-level ``encoding`` using the
                 'replace' error handler and the result is measured by ucolwidth().
                 """
                 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
             def ucolwidth(d):
                 # type: (Text) -> int
                 """Find the column width of a Unicode string for display

                 A character whose East Asian width class appears in the
                 module-level ``_wide`` string counts as two columns, any other
                 character as one. If ``unicodedata`` has no ``east_asian_width``
                 the plain length of ``d`` is returned.
                 """
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 return sum(2 if eaw(c) in _wide else 1 for c in d)
             def getcols(s, start, c):
                 # type: (bytes, int, int) -> bytes
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start

                 Raises ValueError if none of the candidate substrings is exactly
                 c columns wide.
                 '''
                 # NOTE(review): the end index stops at len(s) - 1, so the slice
                 # that reaches the very end of s is never tried -- confirm intended
                 for x in pycompat.xrange(start + c, len(s)):
                     t = s[start:x]
                     if colwidth(t) == c:
                         return t
                 raise ValueError('substring not found')
             def trim(s, width, ellipsis=b'', leftside=False):
                 # type: (bytes, int, bytes, bool) -> bytes
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 >>> from .node import bin
                 >>> def bprint(s):
                 ...     print(pycompat.sysstr(s))
                 >>> ellipsis = b'+++'
                 >>> from . import encoding
                 >>> encoding.encoding = b'utf-8'
                 >>> t = b'1234567890'
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 12345+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++67890
                 >>> bprint(trim(t, 8))
                 12345678
                 >>> bprint(trim(t, 8, leftside=True))
                 34567890
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 5))
                 \xe3\x81\x82\xe3\x81\x84
                 >>> bprint(trim(t, 5, leftside=True))
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 4, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
                 +++
                 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8))
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> bprint(trim(t, 8, leftside=True))
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # s cannot be decoded: fall back to trimming by byte count
                     if len(s) <= width:  # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0:  # no enough room even for ellipsis
                         return ellipsis[: width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width:  # trimming is not needed
                     return s
                 width -= len(ellipsis)
                 if width <= 0:  # no enough room even for ellipsis
                     return ellipsis[: width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side; concat puts
                 # the ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the rest fits
                 for i in pycompat.xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis  # no enough room for multi-column characters
             def lower(s):
                 # type: (bytes) -> bytes
                 """Best-effort encoding-aware case-folding of local string s.

                 ``asciilower`` is tried first; only if it raises UnicodeDecodeError is
                 the string decoded (from the cached ``_utf8`` of a ``localstr``,
                 otherwise from the local ``encoding`` using ``encodingmode``),
                 lowercased as Unicode and re-encoded in the local encoding.

                 Returns ``s`` itself when lowercasing changes nothing, so that a
                 ``localstr`` is preserved. Falls back to ``s.lower()`` on any
                 UnicodeError, and raises ``error.Abort`` when the codec lookup
                 fails (LookupError).
                 """
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # asciilower could not handle s; use the encoding-aware path
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     lu = u.lower()
                     if u == lu:
                         return s  # preserve localstring
                     return lu.encode(_sysstr(encoding))
                 except UnicodeError:
                     return s.lower()  # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint=b"please check your locale settings")
             def upper(s):
                 # type: (bytes) -> bytes
                 """Best-effort encoding-aware upper-casing of local string s.

                 ``asciiupper`` is tried first; if it raises UnicodeDecodeError the
                 work is delegated to ``upperfallback``, whose result is returned.
                 """
                 try:
                     return asciiupper(s)
                 except UnicodeDecodeError:
                     return upperfallback(s)
             def upperfallback(s):
                 # type: (Any) -> Any
                 """Encoding-aware upper-casing, called by upper() when asciiupper fails.

                 The text is taken from the cached ``_utf8`` of a ``localstr``, or
                 decoded from the local ``encoding`` using ``encodingmode``. If
                 upper-casing changes it, the result is re-encoded in the local
                 encoding; otherwise ``s`` itself is returned.

                 Falls back to ``s.upper()`` on UnicodeError and raises
                 ``error.Abort`` on LookupError.
                 """
                 try:
                     text = (
                         s._utf8.decode("utf-8")
                         if isinstance(s, localstr)
                         else s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     )
                     folded = text.upper()
                     if folded != text:
                         return folded.encode(_sysstr(encoding))
                     # nothing changed: hand back the original to preserve localstring
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.upper()
                 except LookupError as k:
                     raise error.Abort(k, hint=b"please check your locale settings")
             class normcasespecs(object):
                 '''Describes the effect of a platform's normcase on ASCII strings.

                 Each platform picks one of the values below; the choice should be
                 consistent with what normcase on that platform actually does.

                 This should be kept in sync with normcase_spec in util.h.
                 '''

                 # the fallback function should always be called
                 other = 0
                 # normcase lowercases ASCII strings
                 lower = -1
                 # normcase uppercases ASCII strings
                 upper = 1
             def jsonescape(s, paranoid=False):
                 # type: (Any, Any) -> Any
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr/safelocalstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape(b'this is a test')
                 'this is a test'
                 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
                 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape(b'a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape(b'')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> s = b'escape characters: \\0 \\x0b \\x7f'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape(b'<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 utf8b = toutf8b(s)
                 try:
                     escaped = _jsonescapeu8fast(utf8b, paranoid)
                 except ValueError:
                     # the fast escaper rejected the input; use the pure fallback below
                     pass
                 else:
                     return escaped
                 return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
             # Invalid UTF-8 bytes are mapped to the U+DCxx range, so those code points
             # have to be decoded and encoded transparently.
             _utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'
             # Indexed by the high nibble of a byte (see getutf8char): the number of
             # bytes to try decoding as one character; 0 marks an ASCII byte.
             _utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]
             def getutf8char(s, pos):
                 # type: (Any, Any) -> Any
                 '''return the full utf-8 character of s that begins at offset pos

                 The returned value is the slice of s holding that character.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos : pos + 1]
                 # the first nibble of the lead byte tells how many bytes to try
                 nbytes = _utf8len[ord(lead) >> 4]
                 if nbytes == 0:
                     # ascii
                     return lead
                 char = s[pos : pos + nbytes]
                 # the decode result is discarded; it only validates the candidate
                 char.decode("utf-8", _utf8strict)
                 return char
             def toutf8b(s):
                 # type: (Any) -> Any
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # assume that the original UTF-8 sequence would never contain
                     # invalid characters in U+DCxx range
                     return s._utf8
                 elif isinstance(s, safelocalstr):
                     # already verified that s is non-lossy in legacy encoding, which
                     # shouldn't contain characters in U+DCxx range
                     return fromlocal(s)
                 elif isasciistr(s):
                     return s
                 # every encoded U+DCxx character starts with 0xED; without that byte
                 # a string that decodes cleanly needs no re-escaping at all
                 if b"\xed" not in s:
                     try:
                         s.decode('utf-8', _utf8strict)
                         return s
                     except UnicodeDecodeError:
                         pass
                 s = pycompat.bytestr(s)
                 # collect the pieces and join once at the end instead of growing a
                 # bytes object with += on every iteration
                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
                             c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not a valid character here: escape this single byte
                         c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                         pos += 1
                     chunks.append(c)
                 return b"".join(chunks)
             def fromutf8b(s):
                 # type: (bytes) -> bytes
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 That is, return the original binary string. This is a round-trip
                 process for strings like filenames, but metadata that was passed
                 through tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = b"\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip(b"\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip(b"\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 if isasciistr(s):
                     return s
                 # fast path - look for uDxxx prefixes in s
                 if b"\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 s = pycompat.bytestr(s)
                 # collect the pieces and join once at the end instead of growing a
                 # bytes object with += on every iteration
                 chunks = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters
                     if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                         # the low byte of the code point is the original raw byte
                         c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
                     chunks.append(c)
                 return b"".join(chunks)