##// END OF EJS Templates
typing: add pseudo localstr.__init__() to help pytype...
Yuya Nishihara -
r44080:da925257 default
parent child Browse files
Show More
@@ -1,696 +1,705 b''
1 # encoding.py - character transcoding support for Mercurial
1 # encoding.py - character transcoding support for Mercurial
2 #
2 #
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
6 # GNU General Public License version 2 or any later version.
7
7
8 from __future__ import absolute_import, print_function
8 from __future__ import absolute_import, print_function
9
9
10 import locale
10 import locale
11 import os
11 import os
12 import unicodedata
12 import unicodedata
13
13
14 from .pycompat import getattr
14 from .pycompat import getattr
15 from . import (
15 from . import (
16 error,
16 error,
17 policy,
17 policy,
18 pycompat,
18 pycompat,
19 )
19 )
20
20
21 from .pure import charencode as charencodepure
21 from .pure import charencode as charencodepure
22
22
23 _TYPE_CHECKING = False
24
23 if not globals(): # hide this from non-pytype users
25 if not globals(): # hide this from non-pytype users
24 from typing import (
26 from typing import (
25 Any,
27 Any,
26 Callable,
28 Callable,
27 List,
29 List,
30 TYPE_CHECKING as _TYPE_CHECKING,
28 Text,
31 Text,
29 Type,
32 Type,
30 TypeVar,
33 TypeVar,
31 Union,
34 Union,
32 )
35 )
33
36
34 # keep pyflakes happy
37 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
38 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
39 assert t
37
40
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
41 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
39
42
40 charencode = policy.importmod('charencode')
43 charencode = policy.importmod('charencode')
41
44
42 isasciistr = charencode.isasciistr
45 isasciistr = charencode.isasciistr
43 asciilower = charencode.asciilower
46 asciilower = charencode.asciilower
44 asciiupper = charencode.asciiupper
47 asciiupper = charencode.asciiupper
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
48 _jsonescapeu8fast = charencode.jsonescapeu8fast
46
49
47 _sysstr = pycompat.sysstr
50 _sysstr = pycompat.sysstr
48
51
49 if pycompat.ispy3:
52 if pycompat.ispy3:
50 unichr = chr
53 unichr = chr
51
54
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
55 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
53 # "Unicode Subtleties"), so we need to ignore them in some places for
56 # "Unicode Subtleties"), so we need to ignore them in some places for
54 # sanity.
57 # sanity.
55 _ignore = [
58 _ignore = [
56 unichr(int(x, 16)).encode("utf-8")
59 unichr(int(x, 16)).encode("utf-8")
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
60 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
58 b"206a 206b 206c 206d 206e 206f feff".split()
61 b"206a 206b 206c 206d 206e 206f feff".split()
59 ]
62 ]
60 # verify the next function will work
63 # verify the next function will work
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
64 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
62
65
63
66
64 def hfsignoreclean(s):
67 def hfsignoreclean(s):
65 # type: (bytes) -> bytes
68 # type: (bytes) -> bytes
66 """Remove codepoints ignored by HFS+ from s.
69 """Remove codepoints ignored by HFS+ from s.
67
70
68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
71 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
69 '.hg'
72 '.hg'
70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
73 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
71 '.hg'
74 '.hg'
72 """
75 """
73 if b"\xe2" in s or b"\xef" in s:
76 if b"\xe2" in s or b"\xef" in s:
74 for c in _ignore:
77 for c in _ignore:
75 s = s.replace(c, b'')
78 s = s.replace(c, b'')
76 return s
79 return s
77
80
78
81
79 # encoding.environ is provided read-only, which may not be used to modify
82 # encoding.environ is provided read-only, which may not be used to modify
80 # the process environment
83 # the process environment
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
84 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
82 if not pycompat.ispy3:
85 if not pycompat.ispy3:
83 environ = os.environ # re-exports
86 environ = os.environ # re-exports
84 elif _nativeenviron:
87 elif _nativeenviron:
85 environ = os.environb # re-exports
88 environ = os.environb # re-exports
86 else:
89 else:
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
90 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
88 # and recreate it once encoding is settled
91 # and recreate it once encoding is settled
89 environ = dict(
92 environ = dict(
90 (k.encode('utf-8'), v.encode('utf-8'))
93 (k.encode('utf-8'), v.encode('utf-8'))
91 for k, v in os.environ.items() # re-exports
94 for k, v in os.environ.items() # re-exports
92 )
95 )
93
96
94 _encodingrewrites = {
97 _encodingrewrites = {
95 b'646': b'ascii',
98 b'646': b'ascii',
96 b'ANSI_X3.4-1968': b'ascii',
99 b'ANSI_X3.4-1968': b'ascii',
97 }
100 }
98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
101 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
102 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
100 # https://bugs.python.org/issue13216
103 # https://bugs.python.org/issue13216
101 if pycompat.iswindows and not pycompat.ispy3:
104 if pycompat.iswindows and not pycompat.ispy3:
102 _encodingrewrites[b'cp65001'] = b'utf-8'
105 _encodingrewrites[b'cp65001'] = b'utf-8'
103
106
104 try:
107 try:
105 encoding = environ.get(b"HGENCODING")
108 encoding = environ.get(b"HGENCODING")
106 if not encoding:
109 if not encoding:
107 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
110 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
108 encoding = _encodingrewrites.get(encoding, encoding)
111 encoding = _encodingrewrites.get(encoding, encoding)
109 except locale.Error:
112 except locale.Error:
110 encoding = b'ascii'
113 encoding = b'ascii'
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
114 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
112 fallbackencoding = b'ISO-8859-1'
115 fallbackencoding = b'ISO-8859-1'
113
116
114
117
115 class localstr(bytes):
118 class localstr(bytes):
116 '''This class allows strings that are unmodified to be
119 '''This class allows strings that are unmodified to be
117 round-tripped to the local encoding and back'''
120 round-tripped to the local encoding and back'''
118
121
119 def __new__(cls, u, l):
122 def __new__(cls, u, l):
120 # type: (Type[_Tlocalstr], bytes, bytes) -> _Tlocalstr
121 s = bytes.__new__(cls, l)
123 s = bytes.__new__(cls, l)
122 s._utf8 = u
124 s._utf8 = u
123 return s
125 return s
124
126
127 if _TYPE_CHECKING:
128 # pseudo implementation to help pytype see localstr() constructor
129 def __init__(self, u, l):
130 # type: (bytes, bytes) -> None
131 super(localstr, self).__init__(l)
132 self._utf8 = u
133
125 def __hash__(self):
134 def __hash__(self):
126 return hash(self._utf8) # avoid collisions in local string space
135 return hash(self._utf8) # avoid collisions in local string space
127
136
128
137
129 class safelocalstr(bytes):
138 class safelocalstr(bytes):
130 """Tagged string denoting it was previously an internal UTF-8 string,
139 """Tagged string denoting it was previously an internal UTF-8 string,
131 and can be converted back to UTF-8 losslessly
140 and can be converted back to UTF-8 losslessly
132
141
133 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
142 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
134 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
143 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
135 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
144 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
136 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
145 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
137 """
146 """
138
147
139
148
140 def tolocal(s):
149 def tolocal(s):
141 # type: (bytes) -> bytes
150 # type: (bytes) -> bytes
142 """
151 """
143 Convert a string from internal UTF-8 to local encoding
152 Convert a string from internal UTF-8 to local encoding
144
153
145 All internal strings should be UTF-8 but some repos before the
154 All internal strings should be UTF-8 but some repos before the
146 implementation of locale support may contain latin1 or possibly
155 implementation of locale support may contain latin1 or possibly
147 other character sets. We attempt to decode everything strictly
156 other character sets. We attempt to decode everything strictly
148 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
157 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
149 replace unknown characters.
158 replace unknown characters.
150
159
151 The localstr class is used to cache the known UTF-8 encoding of
160 The localstr class is used to cache the known UTF-8 encoding of
152 strings next to their local representation to allow lossless
161 strings next to their local representation to allow lossless
153 round-trip conversion back to UTF-8.
162 round-trip conversion back to UTF-8.
154
163
155 >>> u = b'foo: \\xc3\\xa4' # utf-8
164 >>> u = b'foo: \\xc3\\xa4' # utf-8
156 >>> l = tolocal(u)
165 >>> l = tolocal(u)
157 >>> l
166 >>> l
158 'foo: ?'
167 'foo: ?'
159 >>> fromlocal(l)
168 >>> fromlocal(l)
160 'foo: \\xc3\\xa4'
169 'foo: \\xc3\\xa4'
161 >>> u2 = b'foo: \\xc3\\xa1'
170 >>> u2 = b'foo: \\xc3\\xa1'
162 >>> d = { l: 1, tolocal(u2): 2 }
171 >>> d = { l: 1, tolocal(u2): 2 }
163 >>> len(d) # no collision
172 >>> len(d) # no collision
164 2
173 2
165 >>> b'foo: ?' in d
174 >>> b'foo: ?' in d
166 False
175 False
167 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
176 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
168 >>> l = tolocal(l1)
177 >>> l = tolocal(l1)
169 >>> l
178 >>> l
170 'foo: ?'
179 'foo: ?'
171 >>> fromlocal(l) # magically in utf-8
180 >>> fromlocal(l) # magically in utf-8
172 'foo: \\xc3\\xa4'
181 'foo: \\xc3\\xa4'
173 """
182 """
174
183
175 if isasciistr(s):
184 if isasciistr(s):
176 return s
185 return s
177
186
178 try:
187 try:
179 try:
188 try:
180 # make sure string is actually stored in UTF-8
189 # make sure string is actually stored in UTF-8
181 u = s.decode('UTF-8')
190 u = s.decode('UTF-8')
182 if encoding == b'UTF-8':
191 if encoding == b'UTF-8':
183 # fast path
192 # fast path
184 return s
193 return s
185 r = u.encode(_sysstr(encoding), "replace")
194 r = u.encode(_sysstr(encoding), "replace")
186 if u == r.decode(_sysstr(encoding)):
195 if u == r.decode(_sysstr(encoding)):
187 # r is a safe, non-lossy encoding of s
196 # r is a safe, non-lossy encoding of s
188 return safelocalstr(r)
197 return safelocalstr(r)
189 return localstr(s, r)
198 return localstr(s, r)
190 except UnicodeDecodeError:
199 except UnicodeDecodeError:
191 # we should only get here if we're looking at an ancient changeset
200 # we should only get here if we're looking at an ancient changeset
192 try:
201 try:
193 u = s.decode(_sysstr(fallbackencoding))
202 u = s.decode(_sysstr(fallbackencoding))
194 r = u.encode(_sysstr(encoding), "replace")
203 r = u.encode(_sysstr(encoding), "replace")
195 if u == r.decode(_sysstr(encoding)):
204 if u == r.decode(_sysstr(encoding)):
196 # r is a safe, non-lossy encoding of s
205 # r is a safe, non-lossy encoding of s
197 return safelocalstr(r)
206 return safelocalstr(r)
198 return localstr(u.encode('UTF-8'), r)
207 return localstr(u.encode('UTF-8'), r)
199 except UnicodeDecodeError:
208 except UnicodeDecodeError:
200 u = s.decode("utf-8", "replace") # last ditch
209 u = s.decode("utf-8", "replace") # last ditch
201 # can't round-trip
210 # can't round-trip
202 return u.encode(_sysstr(encoding), "replace")
211 return u.encode(_sysstr(encoding), "replace")
203 except LookupError as k:
212 except LookupError as k:
204 raise error.Abort(k, hint=b"please check your locale settings")
213 raise error.Abort(k, hint=b"please check your locale settings")
205
214
206
215
207 def fromlocal(s):
216 def fromlocal(s):
208 # type: (bytes) -> bytes
217 # type: (bytes) -> bytes
209 """
218 """
210 Convert a string from the local character encoding to UTF-8
219 Convert a string from the local character encoding to UTF-8
211
220
212 We attempt to decode strings using the encoding mode set by
221 We attempt to decode strings using the encoding mode set by
213 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
222 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
214 characters will cause an error message. Other modes include
223 characters will cause an error message. Other modes include
215 'replace', which replaces unknown characters with a special
224 'replace', which replaces unknown characters with a special
216 Unicode character, and 'ignore', which drops the character.
225 Unicode character, and 'ignore', which drops the character.
217 """
226 """
218
227
219 # can we do a lossless round-trip?
228 # can we do a lossless round-trip?
220 if isinstance(s, localstr):
229 if isinstance(s, localstr):
221 return s._utf8
230 return s._utf8
222 if isasciistr(s):
231 if isasciistr(s):
223 return s
232 return s
224
233
225 try:
234 try:
226 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
235 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
227 return u.encode("utf-8")
236 return u.encode("utf-8")
228 except UnicodeDecodeError as inst:
237 except UnicodeDecodeError as inst:
229 sub = s[max(0, inst.start - 10) : inst.start + 10]
238 sub = s[max(0, inst.start - 10) : inst.start + 10]
230 raise error.Abort(
239 raise error.Abort(
231 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
240 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
232 )
241 )
233 except LookupError as k:
242 except LookupError as k:
234 raise error.Abort(k, hint=b"please check your locale settings")
243 raise error.Abort(k, hint=b"please check your locale settings")
235
244
236
245
237 def unitolocal(u):
246 def unitolocal(u):
238 # type: (Text) -> bytes
247 # type: (Text) -> bytes
239 """Convert a unicode string to a byte string of local encoding"""
248 """Convert a unicode string to a byte string of local encoding"""
240 return tolocal(u.encode('utf-8'))
249 return tolocal(u.encode('utf-8'))
241
250
242
251
243 def unifromlocal(s):
252 def unifromlocal(s):
244 # type: (bytes) -> Text
253 # type: (bytes) -> Text
245 """Convert a byte string of local encoding to a unicode string"""
254 """Convert a byte string of local encoding to a unicode string"""
246 return fromlocal(s).decode('utf-8')
255 return fromlocal(s).decode('utf-8')
247
256
248
257
249 def unimethod(bytesfunc):
258 def unimethod(bytesfunc):
250 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
259 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
251 """Create a proxy method that forwards __unicode__() and __str__() of
260 """Create a proxy method that forwards __unicode__() and __str__() of
252 Python 3 to __bytes__()"""
261 Python 3 to __bytes__()"""
253
262
254 def unifunc(obj):
263 def unifunc(obj):
255 return unifromlocal(bytesfunc(obj))
264 return unifromlocal(bytesfunc(obj))
256
265
257 return unifunc
266 return unifunc
258
267
259
268
260 # converter functions between native str and byte string. use these if the
269 # converter functions between native str and byte string. use these if the
261 # character encoding is not aware (e.g. exception message) or is known to
270 # character encoding is not aware (e.g. exception message) or is known to
262 # be locale dependent (e.g. date formatting.)
271 # be locale dependent (e.g. date formatting.)
263 if pycompat.ispy3:
272 if pycompat.ispy3:
264 strtolocal = unitolocal
273 strtolocal = unitolocal
265 strfromlocal = unifromlocal
274 strfromlocal = unifromlocal
266 strmethod = unimethod
275 strmethod = unimethod
267 else:
276 else:
268
277
269 def strtolocal(s):
278 def strtolocal(s):
270 # type: (str) -> bytes
279 # type: (str) -> bytes
271 return s # pytype: disable=bad-return-type
280 return s # pytype: disable=bad-return-type
272
281
273 def strfromlocal(s):
282 def strfromlocal(s):
274 # type: (bytes) -> str
283 # type: (bytes) -> str
275 return s # pytype: disable=bad-return-type
284 return s # pytype: disable=bad-return-type
276
285
277 strmethod = pycompat.identity
286 strmethod = pycompat.identity
278
287
279 if not _nativeenviron:
288 if not _nativeenviron:
280 # now encoding and helper functions are available, recreate the environ
289 # now encoding and helper functions are available, recreate the environ
281 # dict to be exported to other modules
290 # dict to be exported to other modules
282 environ = dict(
291 environ = dict(
283 (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
292 (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
284 for k, v in os.environ.items() # re-exports
293 for k, v in os.environ.items() # re-exports
285 )
294 )
286
295
287 if pycompat.ispy3:
296 if pycompat.ispy3:
288 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
297 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
289 # returns bytes.
298 # returns bytes.
290 if pycompat.iswindows:
299 if pycompat.iswindows:
291 # Python 3 on Windows issues a DeprecationWarning about using the bytes
300 # Python 3 on Windows issues a DeprecationWarning about using the bytes
292 # API when os.getcwdb() is called.
301 # API when os.getcwdb() is called.
293 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
302 getcwd = lambda: strtolocal(os.getcwd()) # re-exports
294 else:
303 else:
295 getcwd = os.getcwdb # re-exports
304 getcwd = os.getcwdb # re-exports
296 else:
305 else:
297 getcwd = os.getcwd # re-exports
306 getcwd = os.getcwd # re-exports
298
307
299 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
308 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
300 _wide = _sysstr(
309 _wide = _sysstr(
301 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
310 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
302 and b"WFA"
311 and b"WFA"
303 or b"WF"
312 or b"WF"
304 )
313 )
305
314
306
315
307 def colwidth(s):
316 def colwidth(s):
308 # type: (bytes) -> int
317 # type: (bytes) -> int
309 b"Find the column width of a string for display in the local encoding"
318 b"Find the column width of a string for display in the local encoding"
310 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
319 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
311
320
312
321
313 def ucolwidth(d):
322 def ucolwidth(d):
314 # type: (Text) -> int
323 # type: (Text) -> int
315 b"Find the column width of a Unicode string for display"
324 b"Find the column width of a Unicode string for display"
316 eaw = getattr(unicodedata, 'east_asian_width', None)
325 eaw = getattr(unicodedata, 'east_asian_width', None)
317 if eaw is not None:
326 if eaw is not None:
318 return sum([eaw(c) in _wide and 2 or 1 for c in d])
327 return sum([eaw(c) in _wide and 2 or 1 for c in d])
319 return len(d)
328 return len(d)
320
329
321
330
322 def getcols(s, start, c):
331 def getcols(s, start, c):
323 # type: (bytes, int, int) -> bytes
332 # type: (bytes, int, int) -> bytes
324 '''Use colwidth to find a c-column substring of s starting at byte
333 '''Use colwidth to find a c-column substring of s starting at byte
325 index start'''
334 index start'''
326 for x in pycompat.xrange(start + c, len(s)):
335 for x in pycompat.xrange(start + c, len(s)):
327 t = s[start:x]
336 t = s[start:x]
328 if colwidth(t) == c:
337 if colwidth(t) == c:
329 return t
338 return t
330 raise ValueError('substring not found')
339 raise ValueError('substring not found')
331
340
332
341
333 def trim(s, width, ellipsis=b'', leftside=False):
342 def trim(s, width, ellipsis=b'', leftside=False):
334 # type: (bytes, int, bytes, bool) -> bytes
343 # type: (bytes, int, bytes, bool) -> bytes
335 """Trim string 's' to at most 'width' columns (including 'ellipsis').
344 """Trim string 's' to at most 'width' columns (including 'ellipsis').
336
345
337 If 'leftside' is True, left side of string 's' is trimmed.
346 If 'leftside' is True, left side of string 's' is trimmed.
338 'ellipsis' is always placed at trimmed side.
347 'ellipsis' is always placed at trimmed side.
339
348
340 >>> from .node import bin
349 >>> from .node import bin
341 >>> def bprint(s):
350 >>> def bprint(s):
342 ... print(pycompat.sysstr(s))
351 ... print(pycompat.sysstr(s))
343 >>> ellipsis = b'+++'
352 >>> ellipsis = b'+++'
344 >>> from . import encoding
353 >>> from . import encoding
345 >>> encoding.encoding = b'utf-8'
354 >>> encoding.encoding = b'utf-8'
346 >>> t = b'1234567890'
355 >>> t = b'1234567890'
347 >>> bprint(trim(t, 12, ellipsis=ellipsis))
356 >>> bprint(trim(t, 12, ellipsis=ellipsis))
348 1234567890
357 1234567890
349 >>> bprint(trim(t, 10, ellipsis=ellipsis))
358 >>> bprint(trim(t, 10, ellipsis=ellipsis))
350 1234567890
359 1234567890
351 >>> bprint(trim(t, 8, ellipsis=ellipsis))
360 >>> bprint(trim(t, 8, ellipsis=ellipsis))
352 12345+++
361 12345+++
353 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
362 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
354 +++67890
363 +++67890
355 >>> bprint(trim(t, 8))
364 >>> bprint(trim(t, 8))
356 12345678
365 12345678
357 >>> bprint(trim(t, 8, leftside=True))
366 >>> bprint(trim(t, 8, leftside=True))
358 34567890
367 34567890
359 >>> bprint(trim(t, 3, ellipsis=ellipsis))
368 >>> bprint(trim(t, 3, ellipsis=ellipsis))
360 +++
369 +++
361 >>> bprint(trim(t, 1, ellipsis=ellipsis))
370 >>> bprint(trim(t, 1, ellipsis=ellipsis))
362 +
371 +
363 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
372 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
364 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
373 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
365 >>> bprint(trim(t, 12, ellipsis=ellipsis))
374 >>> bprint(trim(t, 12, ellipsis=ellipsis))
366 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
375 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
367 >>> bprint(trim(t, 10, ellipsis=ellipsis))
376 >>> bprint(trim(t, 10, ellipsis=ellipsis))
368 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
377 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
369 >>> bprint(trim(t, 8, ellipsis=ellipsis))
378 >>> bprint(trim(t, 8, ellipsis=ellipsis))
370 \xe3\x81\x82\xe3\x81\x84+++
379 \xe3\x81\x82\xe3\x81\x84+++
371 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
380 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
372 +++\xe3\x81\x88\xe3\x81\x8a
381 +++\xe3\x81\x88\xe3\x81\x8a
373 >>> bprint(trim(t, 5))
382 >>> bprint(trim(t, 5))
374 \xe3\x81\x82\xe3\x81\x84
383 \xe3\x81\x82\xe3\x81\x84
375 >>> bprint(trim(t, 5, leftside=True))
384 >>> bprint(trim(t, 5, leftside=True))
376 \xe3\x81\x88\xe3\x81\x8a
385 \xe3\x81\x88\xe3\x81\x8a
377 >>> bprint(trim(t, 4, ellipsis=ellipsis))
386 >>> bprint(trim(t, 4, ellipsis=ellipsis))
378 +++
387 +++
379 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
388 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
380 +++
389 +++
381 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
390 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
382 >>> bprint(trim(t, 12, ellipsis=ellipsis))
391 >>> bprint(trim(t, 12, ellipsis=ellipsis))
383 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
392 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
384 >>> bprint(trim(t, 10, ellipsis=ellipsis))
393 >>> bprint(trim(t, 10, ellipsis=ellipsis))
385 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
394 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
386 >>> bprint(trim(t, 8, ellipsis=ellipsis))
395 >>> bprint(trim(t, 8, ellipsis=ellipsis))
387 \x11\x22\x33\x44\x55+++
396 \x11\x22\x33\x44\x55+++
388 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
397 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
389 +++\x66\x77\x88\x99\xaa
398 +++\x66\x77\x88\x99\xaa
390 >>> bprint(trim(t, 8))
399 >>> bprint(trim(t, 8))
391 \x11\x22\x33\x44\x55\x66\x77\x88
400 \x11\x22\x33\x44\x55\x66\x77\x88
392 >>> bprint(trim(t, 8, leftside=True))
401 >>> bprint(trim(t, 8, leftside=True))
393 \x33\x44\x55\x66\x77\x88\x99\xaa
402 \x33\x44\x55\x66\x77\x88\x99\xaa
394 >>> bprint(trim(t, 3, ellipsis=ellipsis))
403 >>> bprint(trim(t, 3, ellipsis=ellipsis))
395 +++
404 +++
396 >>> bprint(trim(t, 1, ellipsis=ellipsis))
405 >>> bprint(trim(t, 1, ellipsis=ellipsis))
397 +
406 +
398 """
407 """
399 try:
408 try:
400 u = s.decode(_sysstr(encoding))
409 u = s.decode(_sysstr(encoding))
401 except UnicodeDecodeError:
410 except UnicodeDecodeError:
402 if len(s) <= width: # trimming is not needed
411 if len(s) <= width: # trimming is not needed
403 return s
412 return s
404 width -= len(ellipsis)
413 width -= len(ellipsis)
405 if width <= 0: # no enough room even for ellipsis
414 if width <= 0: # no enough room even for ellipsis
406 return ellipsis[: width + len(ellipsis)]
415 return ellipsis[: width + len(ellipsis)]
407 if leftside:
416 if leftside:
408 return ellipsis + s[-width:]
417 return ellipsis + s[-width:]
409 return s[:width] + ellipsis
418 return s[:width] + ellipsis
410
419
411 if ucolwidth(u) <= width: # trimming is not needed
420 if ucolwidth(u) <= width: # trimming is not needed
412 return s
421 return s
413
422
414 width -= len(ellipsis)
423 width -= len(ellipsis)
415 if width <= 0: # no enough room even for ellipsis
424 if width <= 0: # no enough room even for ellipsis
416 return ellipsis[: width + len(ellipsis)]
425 return ellipsis[: width + len(ellipsis)]
417
426
418 if leftside:
427 if leftside:
419 uslice = lambda i: u[i:]
428 uslice = lambda i: u[i:]
420 concat = lambda s: ellipsis + s
429 concat = lambda s: ellipsis + s
421 else:
430 else:
422 uslice = lambda i: u[:-i]
431 uslice = lambda i: u[:-i]
423 concat = lambda s: s + ellipsis
432 concat = lambda s: s + ellipsis
424 for i in pycompat.xrange(1, len(u)):
433 for i in pycompat.xrange(1, len(u)):
425 usub = uslice(i)
434 usub = uslice(i)
426 if ucolwidth(usub) <= width:
435 if ucolwidth(usub) <= width:
427 return concat(usub.encode(_sysstr(encoding)))
436 return concat(usub.encode(_sysstr(encoding)))
428 return ellipsis # no enough room for multi-column characters
437 return ellipsis # no enough room for multi-column characters
429
438
430
439
431 def lower(s):
440 def lower(s):
432 # type: (bytes) -> bytes
441 # type: (bytes) -> bytes
433 b"best-effort encoding-aware case-folding of local string s"
442 b"best-effort encoding-aware case-folding of local string s"
434 try:
443 try:
435 return asciilower(s)
444 return asciilower(s)
436 except UnicodeDecodeError:
445 except UnicodeDecodeError:
437 pass
446 pass
438 try:
447 try:
439 if isinstance(s, localstr):
448 if isinstance(s, localstr):
440 u = s._utf8.decode("utf-8")
449 u = s._utf8.decode("utf-8")
441 else:
450 else:
442 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
451 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
443
452
444 lu = u.lower()
453 lu = u.lower()
445 if u == lu:
454 if u == lu:
446 return s # preserve localstring
455 return s # preserve localstring
447 return lu.encode(_sysstr(encoding))
456 return lu.encode(_sysstr(encoding))
448 except UnicodeError:
457 except UnicodeError:
449 return s.lower() # we don't know how to fold this except in ASCII
458 return s.lower() # we don't know how to fold this except in ASCII
450 except LookupError as k:
459 except LookupError as k:
451 raise error.Abort(k, hint=b"please check your locale settings")
460 raise error.Abort(k, hint=b"please check your locale settings")
452
461
453
462
454 def upper(s):
463 def upper(s):
455 # type: (bytes) -> bytes
464 # type: (bytes) -> bytes
456 b"best-effort encoding-aware case-folding of local string s"
465 b"best-effort encoding-aware case-folding of local string s"
457 try:
466 try:
458 return asciiupper(s)
467 return asciiupper(s)
459 except UnicodeDecodeError:
468 except UnicodeDecodeError:
460 return upperfallback(s)
469 return upperfallback(s)
461
470
462
471
463 def upperfallback(s):
472 def upperfallback(s):
464 # type: (Any) -> Any
473 # type: (Any) -> Any
465 try:
474 try:
466 if isinstance(s, localstr):
475 if isinstance(s, localstr):
467 u = s._utf8.decode("utf-8")
476 u = s._utf8.decode("utf-8")
468 else:
477 else:
469 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
478 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
470
479
471 uu = u.upper()
480 uu = u.upper()
472 if u == uu:
481 if u == uu:
473 return s # preserve localstring
482 return s # preserve localstring
474 return uu.encode(_sysstr(encoding))
483 return uu.encode(_sysstr(encoding))
475 except UnicodeError:
484 except UnicodeError:
476 return s.upper() # we don't know how to fold this except in ASCII
485 return s.upper() # we don't know how to fold this except in ASCII
477 except LookupError as k:
486 except LookupError as k:
478 raise error.Abort(k, hint=b"please check your locale settings")
487 raise error.Abort(k, hint=b"please check your locale settings")
479
488
480
489
481 class normcasespecs(object):
490 class normcasespecs(object):
482 '''what a platform's normcase does to ASCII strings
491 '''what a platform's normcase does to ASCII strings
483
492
484 This is specified per platform, and should be consistent with what normcase
493 This is specified per platform, and should be consistent with what normcase
485 on that platform actually does.
494 on that platform actually does.
486
495
487 lower: normcase lowercases ASCII strings
496 lower: normcase lowercases ASCII strings
488 upper: normcase uppercases ASCII strings
497 upper: normcase uppercases ASCII strings
489 other: the fallback function should always be called
498 other: the fallback function should always be called
490
499
491 This should be kept in sync with normcase_spec in util.h.'''
500 This should be kept in sync with normcase_spec in util.h.'''
492
501
493 lower = -1
502 lower = -1
494 upper = 1
503 upper = 1
495 other = 0
504 other = 0
496
505
497
506
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    # normalize to UTF-8b first so the escapers only ever see utf-8 bytes
    encoded = toutf8b(s)
    try:
        # fast path: the C implementation rejects input it can't handle
        # by raising ValueError
        return _jsonescapeu8fast(encoded, paranoid)
    except ValueError:
        # slow but complete pure-Python fallback
        return charencodepure.jsonescapeu8fallback(encoded, paranoid)
550
559
551
560
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# number of bytes in a utf-8 sequence, indexed by the high nibble of its
# first byte; 0 marks a single-byte (ascii) character
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
560
569
561
570
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # the high nibble of the lead byte determines the sequence length
    nbytes = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not nbytes:
        # single-byte ascii character; nothing to validate
        return s[pos : pos + 1]

    char = s[pos : pos + nbytes]
    # an attempted decode validates the sequence (raises UnicodeError
    # on a truncated or malformed character)
    char.decode("utf-8", _utf8strict)
    return char
579
588
580
589
def toutf8b(s):
    # type: (bytes) -> bytes
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    # fast path: no potential surrogate bytes (all start with 0xed), so a
    # successful whole-string decode proves s is already valid UTF-8
    if b"\xed" not in s:
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping each
    # byte that can't be decoded into the U+DCxx surrogate range
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
648
657
649
658
def fromutf8b(s):
    # type: (bytes) -> bytes
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    # (every U+DCxx escape encodes to a sequence starting with 0xed)
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # recover the original escaped byte from the low 8 bits of
            # the surrogate code point
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
General Comments 0
You need to be logged in to leave comments. Login now