upstream/mercurial-mirror Commit - r42002:25694a78

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import, print_function

8

from __future__ import absolute_import, print_function

9

10

import locale

10

import locale

11

import os

11

import os

12

import unicodedata

12

import unicodedata

13

14

from . import (

14

from . import (

15

error,

15

error,

16

policy,

16

policy,

17

pycompat,

17

pycompat,

18

)

18

)

19

20

from .pure import (

20

from .pure import (

21

charencode as charencodepure,

21

charencode as charencodepure,

22

)

22

)

23

24

# Resolve the 'charencode' implementation through policy.importmod() and
# re-export the helpers this module relies on.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # there is no unichr() builtin on Python 3; chr() is used in its place
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# verify the next function will work: hfsignoreclean() only scans strings
# containing one of these two lead bytes
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)

44

45

def hfsignoreclean(s):
    """Strip every codepoint that HFS+ ignores out of s.

    s is returned untouched when it contains neither of the two lead
    bytes that all entries of _ignore start with.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check so the common case skips the replace loop entirely
    if "\xe2" not in s and "\xef" not in s:
        return s
    for ignored in _ignore:
        s = s.replace(ignored, '')
    return s

57

58

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()
    )  # re-exports

# encoding names that are replaced by another name before being used
_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites['cp65001'] = 'utf-8'

# HGENCODING wins when set; otherwise ask the locale module, and settle for
# ascii when the locale cannot be queried.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = 'ascii'
# error-handling mode passed to decode() by fromlocal(), lower() and
# upperfallback()
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() when a string is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

90

91

class localstr(bytes):
    '''Local-encoding byte string that carries its UTF-8 original along

    The bytes value is the local representation; the UTF-8 form is kept
    in the _utf8 attribute so an unmodified string can be round-tripped
    to the local encoding and back.'''
    def __new__(cls, u, l):
        # u: the UTF-8 bytes, l: the local-encoding bytes
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj
    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

100

101

class safelocalstr(bytes):
    """Marker type for a local string that originated from internal UTF-8
    and can be turned back into UTF-8 without loss

    It adds nothing to bytes, so it compares and hashes like plain bytes:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """

110

111

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Decoding is therefore attempted strictly as UTF-8 first, then with
    the fallback encoding (Latin-1), and as a last resort as UTF-8 with
    unknown characters replaced.

    When the local form cannot represent the text exactly, a localstr
    is returned so the known UTF-8 bytes travel with the local bytes
    and fromlocal() can restore them losslessly. When the local form
    decodes back to the same text, a safelocalstr is returned instead.

    Raises error.Abort if a codec lookup fails.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding is UTF-8 already
                return s
            enc = _sysstr(encoding)
            local = text.encode(enc, r"replace")
            if text == local.decode(enc):
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            return localstr(s, local)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                enc = _sysstr(encoding)
                local = text.encode(enc, r"replace")
                if text == local.decode(enc):
                    # local is a safe, non-lossy encoding of s
                    return safelocalstr(local)
                return localstr(text.encode('UTF-8'), local)
            except UnicodeDecodeError:
                # last ditch: this result can't be round-tripped
                text = s.decode("utf-8", "replace")
                return text.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

175

176

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr hands back the UTF-8 bytes stored on it, and pure ASCII
    input is returned unchanged. Anything else is decoded with the
    local encoding using the error mode taken from HGENCODINGMODE
    (default 'strict', where unknown characters cause an error; other
    modes include 'replace', which substitutes a special Unicode
    character, and 'ignore', which drops the character) and is then
    re-encoded as UTF-8.

    Raises error.Abort when decoding fails or the codec lookup fails.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to 10 bytes on either side of the failing offset
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        sub = s[lo:hi]
        raise error.Abort("decoding near '%s': %s!"
                          % (sub, pycompat.bytestr(inst)))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

202

203

def unitolocal(u):
    """Encode unicode string u as UTF-8 and pass the bytes to tolocal()"""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

206

207

def unifromlocal(s):
    """Run local byte string s through fromlocal() and decode the UTF-8
    result into a unicode string"""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

210

211

def unimethod(bytesfunc):
    """Build a proxy method so that __unicode__() and Python 3's __str__()
    can be forwarded to __bytes__()

    The returned function calls bytesfunc(obj) and converts the resulting
    local byte string with unifromlocal()."""
    def unifunc(obj):
        local = bytesfunc(obj)
        return unifromlocal(local)
    return unifunc

217

218

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # native str is unicode here, so go through the unicode converters
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # native str is already a byte string; nothing to convert
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()
    )  # re-exports

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value is the set of east_asian_width() classes that ucolwidth() counts
# as two columns.
_wide = _sysstr(
    "WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" else "WF"
)

251

252

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(_sysstr(encoding), r'replace')
    return ucolwidth(decoded)

255

256

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    # characters whose east-asian-width class is listed in _wide take two
    # columns, everything else one
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)

262

263

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate end offsets run from start + c up to, but not including,
    len(s). The first slice whose colwidth() equals c is returned; when
    no candidate matches, the function returns None.'''
    # NOTE(review): the end offset len(s) itself is never tried, so a match
    # that needs the whole tail of s yields None -- confirm this is intended
    for end in pycompat.xrange(start + c, len(s)):
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
    return None

270

271

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    The right end of 's' is cut by default; with 'leftside' set to True
    the left end is cut instead. 'ellipsis' always goes on the side that
    was cut. If 's' cannot be decoded in the local encoding, it is
    trimmed byte-wise, counting each byte as one column.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count
        if len(s) <= width:
            # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0:
            # not enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0:
        # not enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character per iteration from the trimmed side until the
    # remainder fits into the columns left after the ellipsis
    for i in pycompat.xrange(1, len(u)):
        if leftside:
            usub = u[i:]
        else:
            usub = u[:-i]
        if ucolwidth(usub) <= room:
            kept = usub.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    # not enough room for multi-column characters
    return ellipsis

366

367

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # asciilower() could not handle s; take the unicode-aware path below
        pass
    try:
        if isinstance(s, localstr):
            # use the UTF-8 form stored on the localstr
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve localstring
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

387

388

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        result = asciiupper(s)
    except UnicodeDecodeError:
        # asciiupper() could not handle s; use the unicode-aware fallback
        result = upperfallback(s)
    return result

394

395

def upperfallback(s):
    """Uppercase local string s by way of unicode

    s is decoded (from its stored UTF-8 form when it is a localstr,
    otherwise with the local encoding and encodingmode), uppercased and
    re-encoded in the local encoding. s itself is returned when
    uppercasing changes nothing. On a UnicodeError the bytes-level
    s.upper() is used; a failed codec lookup raises error.Abort."""
    try:
        if isinstance(s, localstr):
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        raised = text.upper()
        if raised != text:
            return raised.encode(_sysstr(encoding))
        # nothing changed: hand back s itself to preserve localstring
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

410

411

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    Each platform picks one of these values, and the choice should be
    consistent with what normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    upper = 1
    lower = -1

425

426

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON has no way to carry non-Unicode bytes, which is a problem for
    us. The approach taken is:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    utf8b = toutf8b(s)
    try:
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        # the fast escaper rejected this input; use the pure fallback below
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)

477

478

# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Number of bytes getutf8char() attempts to decode, indexed by the high
# nibble of the first byte; 0 marks a single ASCII byte.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

486

487

def getutf8char(s, pos):

487

def getutf8char(s, pos):

488

'''get the next full utf-8 character in the given string, starting at pos

488

'''get the next full utf-8 character in the given string, starting at pos

489

490

Raises a UnicodeError if the given location does not start a valid

490

Raises a UnicodeError if the given location does not start a valid

491

utf-8 character.

491

utf-8 character.

492

'''

492

'''

493

494

# find how many bytes to attempt decoding from first nibble

494

# find how many bytes to attempt decoding from first nibble

495

l = _utf8len[ord(s[pos:pos + 1]) >> 4]

495

l = _utf8len[ord(s[pos:pos + 1]) >> 4]

496

if not l: # ascii

496

if not l: # ascii

497

return s[pos:pos + 1]

497

return s[pos:pos + 1]

498

499

c = s[pos:pos + l]

499

c = s[pos:pos + l]

500

# validate with attempted decode

500

# validate with attempted decode

501

c.decode("utf-8", _utf8strict)

501

c.decode("utf-8", _utf8strict)

502

return c

502

return c

503

504

def toutf8b(s):

504

def toutf8b(s):

505

'''convert a local, possibly-binary string into UTF-8b

505

'''convert a local, possibly-binary string into UTF-8b

506

507

This is intended as a generic method to preserve data when working

507

This is intended as a generic method to preserve data when working

508

with schemes like JSON and XML that have no provision for

508

with schemes like JSON and XML that have no provision for

509

arbitrary byte strings. As Mercurial often doesn't know

509

arbitrary byte strings. As Mercurial often doesn't know

510

what encoding data is in, we use so-called UTF-8b.

510

what encoding data is in, we use so-called UTF-8b.

511

512

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

512

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

513

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

513

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

514

uDC00-uDCFF.

514

uDC00-uDCFF.

515

516

Principles of operation:

516

Principles of operation:

517

518

- ASCII and UTF-8 data successfully round-trips and is understood

518

- ASCII and UTF-8 data successfully round-trips and is understood

519

by Unicode-oriented clients

519

by Unicode-oriented clients

520

- filenames and file contents in arbitrary other encodings can have

520

- filenames and file contents in arbitrary other encodings can have

521

be round-tripped or recovered by clueful clients

521

be round-tripped or recovered by clueful clients

522

- local strings that have a cached known UTF-8 encoding (aka

522

- local strings that have a cached known UTF-8 encoding (aka

523

localstr) get sent as UTF-8 so Unicode-oriented clients get the

523

localstr) get sent as UTF-8 so Unicode-oriented clients get the

524

Unicode data they want

524

Unicode data they want

525

- non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well

525

- non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well

526

- because we must preserve UTF-8 bytestring in places such as

526

- because we must preserve UTF-8 bytestring in places such as

527

filenames, metadata can't be roundtripped without help

527

filenames, metadata can't be roundtripped without help

528

529

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

529

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

530

arbitrary bytes into an internal Unicode format that can be

530

arbitrary bytes into an internal Unicode format that can be

531

re-encoded back into the original. Here we are exposing the

531

re-encoded back into the original. Here we are exposing the

532

internal surrogate encoding as a UTF-8 string.)

532

internal surrogate encoding as a UTF-8 string.)

533

'''

533

'''

534

535

if isinstance(s, localstr):

535

if isinstance(s, localstr):

536

# assume that the original UTF-8 sequence would never contain

536

# assume that the original UTF-8 sequence would never contain

537

# invalid characters in U+DCxx range

537

# invalid characters in U+DCxx range

538

return s._utf8

538

return s._utf8

539

elif isinstance(s, safelocalstr):

539

elif isinstance(s, safelocalstr):

540

# already verified that s is non-lossy in legacy encoding, which

540

# already verified that s is non-lossy in legacy encoding, which

541

# shouldn't contain characters in U+DCxx range

541

# shouldn't contain characters in U+DCxx range

542

return fromlocal(s)

542

return fromlocal(s)

543

elif isasciistr(s):

543

elif isasciistr(s):

544

return s

544

return s

545

if "\xed" not in s:

545

if "\xed" not in s:

546

try:

546

try:

547

s.decode('utf-8', _utf8strict)

547

s.decode('utf-8', _utf8strict)

548

return s

548

return s

549

except UnicodeDecodeError:

549

except UnicodeDecodeError:

550

pass

550

pass

551

552

s = pycompat.bytestr(s)

552

s = pycompat.bytestr(s)

553

r = ""

553

r = ""

554

pos = 0

554

pos = 0

555

l = len(s)

555

l = len(s)

556

while pos < l:

556

while pos < l:

557

try:

557

try:

558

c = getutf8char(s, pos)

558

c = getutf8char(s, pos)

559

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

559

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

560

# have to re-escape existing U+DCxx characters

560

# have to re-escape existing U+DCxx characters

561

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

561

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

562

pos += 1

562

pos += 1

563

else:

563

else:

564

pos += len(c)

564

pos += len(c)

565

except UnicodeDecodeError:

565

except UnicodeDecodeError:

566

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

566

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

567

pos += 1

567

pos += 1

568

r += c

568

r += c

569

return r

569

return r

570

571

def fromutf8b(s):

571

def fromutf8b(s):

572

'''Given a UTF-8b string, return a local, possibly-binary string.

572

'''Given a UTF-8b string, return a local, possibly-binary string.

573

574

return the original binary string. This

574

return the original binary string. This

575

is a round-trip process for strings like filenames, but metadata

575

is a round-trip process for strings like filenames, but metadata

576

that's was passed through tolocal will remain in UTF-8.

576

that's was passed through tolocal will remain in UTF-8.

577

578

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

578

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

579

>>> m = b"\\xc3\\xa9\\x99abcd"

579

>>> m = b"\\xc3\\xa9\\x99abcd"

580

>>> toutf8b(m)

580

>>> toutf8b(m)

581

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

581

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

582

>>> roundtrip(m)

582

>>> roundtrip(m)

583

True

583

True

584

>>> roundtrip(b"\\xc2\\xc2\\x80")

584

>>> roundtrip(b"\\xc2\\xc2\\x80")

585

True

585

True

586

>>> roundtrip(b"\\xef\\xbf\\xbd")

586

>>> roundtrip(b"\\xef\\xbf\\xbd")

587

True

587

True

588

>>> roundtrip(b"\\xef\\xef\\xbf\\xbd")

588

>>> roundtrip(b"\\xef\\xef\\xbf\\xbd")

589

True

589

True

590

>>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")

590

>>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")

591

True

591

True

592

'''

592

'''

593

594

if isasciistr(s):

594

if isasciistr(s):

595

return s

595

return s

596

# fast path - look for uDxxx prefixes in s

596

# fast path - look for uDxxx prefixes in s

597

if "\xed" not in s:

597

if "\xed" not in s:

598

return s

598

return s

599

600

# We could do this with the unicode type but some Python builds

600

# We could do this with the unicode type but some Python builds

601

# use UTF-16 internally (issue5031) which causes non-BMP code

601

# use UTF-16 internally (issue5031) which causes non-BMP code

602

# points to be escaped. Instead, we use our handy getutf8char

602

# points to be escaped. Instead, we use our handy getutf8char

603

# helper again to walk the string without "decoding" it.

603

# helper again to walk the string without "decoding" it.

604

605

s = pycompat.bytestr(s)

605

s = pycompat.bytestr(s)

606

r = ""

606

r = ""

607

pos = 0

607

pos = 0

608

l = len(s)

608

l = len(s)

609

while pos < l:

609

while pos < l:

610

c = getutf8char(s, pos)

610

c = getutf8char(s, pos)

611

pos += len(c)

611

pos += len(c)

612

# unescape U+DCxx characters

612

# unescape U+DCxx characters

613

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

613

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

614

c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)

614

c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)

615

r += c

615

r += c

616

return r

616

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import, print_function
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             from .pure import (
                 charencode as charencodepure,
             )
             charencode = policy.importmod(r'charencode')
             isasciistr = charencode.isasciistr
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             _jsonescapeu8fast = charencode.jsonescapeu8fast
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work
             assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 """Remove codepoints ignored by HFS+ from s.
                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 if "\xe2" in s or "\xef" in s:
                     for c in _ignore:
                         s = s.replace(c, '')
                 return s
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
-                environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
+                environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             _encodingrewrites = {
                 '646': 'ascii',
                 'ANSI_X3.4-1968': 'ascii',
             }
             # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
             # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
             # https://bugs.python.org/issue13216
             if pycompat.iswindows and not pycompat.ispy3:
                 _encodingrewrites['cp65001'] = 'utf-8'
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingrewrites.get(encoding, encoding)
             except locale.Error:
                 encoding = 'ascii'
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             fallbackencoding = 'ISO-8859-1'
             class localstr(bytes):
                 '''This class allows strings that are unmodified to be
                 round-tripped to the local encoding and back'''
                 def __new__(cls, u, l):
                     s = bytes.__new__(cls, l)
                     s._utf8 = u
                     return s
                 def __hash__(self):
                     return hash(self._utf8) # avoid collisions in local string space
             class safelocalstr(bytes):
                 """Tagged string denoting it was previously an internal UTF-8 string,
                 and can be converted back to UTF-8 losslessly
                 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
                 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
                 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
                 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
                 """
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding
                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.
                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.
                 >>> u = b'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = b'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 >>> b'foo: ?' in d
                 False
                 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 if isasciistr(s):
                     return s
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
-                        r = u.encode(_sysstr(encoding), u"replace")
+                        r = u.encode(_sysstr(encoding), r"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return safelocalstr(r)
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
-                            r = u.encode(_sysstr(encoding), u"replace")
+                            r = u.encode(_sysstr(encoding), r"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return safelocalstr(r)
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
-                            return u.encode(_sysstr(encoding), u"replace")
+                            return u.encode(_sysstr(encoding), r"replace")
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 if isasciistr(s):
                     return s
                 try:
                     u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return u.encode("utf-8")
                 except UnicodeDecodeError as inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!"
                                       % (sub, pycompat.bytestr(inst)))
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def unitolocal(u):
                 """Convert a unicode string to a byte string of local encoding"""
                 return tolocal(u.encode('utf-8'))
             def unifromlocal(s):
                 """Convert a byte string of local encoding to a unicode string"""
                 return fromlocal(s).decode('utf-8')
             def unimethod(bytesfunc):
                 """Create a proxy method that forwards __unicode__() and __str__() of
                 Python 3 to __bytes__()"""
                 def unifunc(obj):
                     return unifromlocal(bytesfunc(obj))
                 return unifunc
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
-                environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
+                environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             if pycompat.ispy3:
                 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
                 # returns bytes.
                 if pycompat.iswindows:
                     # Python 3 on Windows issues a DeprecationWarning about using the bytes
                     # API when os.getcwdb() is called.
                     getcwd = lambda: strtolocal(os.getcwd())  # re-exports
                 else:
                     getcwd = os.getcwdb  # re-exports
             else:
                 getcwd = os.getcwd  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                             and "WFA" or "WF")
             def colwidth(s):
                 "Find the column width of a string for display in the local encoding"
-                return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
+                return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
             def ucolwidth(d):
                 "Find the column width of a Unicode string for display"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is not None:
                     return sum([eaw(c) in _wide and 2 or 1 for c in d])
                 return len(d)
             def getcols(s, start, c):
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start'''
                 for x in pycompat.xrange(start + c, len(s)):
                     t = s[start:x]
                     if colwidth(t) == c:
                         return t
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').
                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.
                 >>> from .node import bin
                 >>> def bprint(s):
                 ...     print(pycompat.sysstr(s))
                 >>> ellipsis = b'+++'
                 >>> from . import encoding
                 >>> encoding.encoding = b'utf-8'
                 >>> t = b'1234567890'
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
 +++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++67890
                 >>> bprint(trim(t, 8))
                 12345678
                 >>> bprint(trim(t, 8, leftside=True))
                 34567890
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 5))
                 \xe3\x81\x82\xe3\x81\x84
                 >>> bprint(trim(t, 5, leftside=True))
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 4, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
                 +++
                 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8))
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> bprint(trim(t, 8, leftside=True))
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # no enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 width -= len(ellipsis)
                 if width <= 0: # no enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 for i in pycompat.xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis # no enough room for multi-column characters
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     lu = u.lower()
                     if u == lu:
                         return s # preserve localstring
                     return lu.encode(_sysstr(encoding))
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     return asciiupper(s)
                 except UnicodeDecodeError:
                     return upperfallback(s)
             def upperfallback(s):
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     uu = u.upper()
                     if u == uu:
                         return s # preserve localstring
                     return uu.encode(_sysstr(encoding))
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings
                 This is specified per platform, and should be consistent with what normcase
                 on that platform actually does.
                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called
                 This should be kept in sync with normcase_spec in util.h.'''
                 lower = -1
                 upper = 1
                 other = 0
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON
                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:
                 - localstr/safelocalstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping
                 (escapes are doubled in these tests)
                 >>> jsonescape(b'this is a test')
                 'this is a test'
                 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
                 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape(b'a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape(b'')
                 ''
                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.
                 >>> s = b'escape characters: \\0 \\x0b \\x7f'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape(b'<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 u8chars = toutf8b(s)
                 try:
                     return _jsonescapeu8fast(u8chars, paranoid)
                 except ValueError:
                     pass
                 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
             # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
             # bytes are mapped to that range.
             if pycompat.ispy3:
                 _utf8strict = r'surrogatepass'
             else:
                 _utf8strict = r'strict'
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''return the complete utf-8 character that begins at offset pos in s

                 The number of bytes to examine is looked up from the high nibble of
                 the first byte; the candidate bytes are then test-decoded.

                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 lead = s[pos:pos + 1]
                 # the high nibble of the lead byte selects how many bytes to try
                 width = _utf8len[ord(lead) >> 4]
                 if width == 0:
                     # plain ascii, nothing to validate
                     return lead
                 char = s[pos:pos + width]
                 # decode purely for validation; the decoded value is discarded
                 char.decode("utf-8", _utf8strict)
                 return char
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # assume that the original UTF-8 sequence would never contain
                     # invalid characters in U+DCxx range
                     return s._utf8
                 elif isinstance(s, safelocalstr):
                     # already verified that s is non-lossy in legacy encoding, which
                     # shouldn't contain characters in U+DCxx range
                     return fromlocal(s)
                 elif isasciistr(s):
                     return s
                 if "\xed" not in s:
                     # every U+DCxx sequence starts with 0xed, so without that byte
                     # there is nothing to re-escape and valid UTF-8 passes as-is
                     try:
                         s.decode('utf-8', _utf8strict)
                         return s
                     except UnicodeDecodeError:
                         pass

                 s = pycompat.bytestr(s)
                 # collect the pieces and join once at the end; repeated
                 # concatenation would copy the growing result on every iteration
                 parts = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         # have to re-escape existing U+DCxx characters
                         escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
                     except UnicodeDecodeError:
                         # not the start of a valid character: escape this byte
                         escape = True
                     if escape:
                         # map the single byte at pos to U+DC00 + its value and
                         # resume scanning at the following byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                         pos += 1
                     else:
                         pos += len(c)
                     parts.append(c)
                 return "".join(parts)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return the original binary string.

                 This is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 Strings that are pure ASCII or contain no 0xed byte are returned
                 unchanged. Otherwise the string is walked with getutf8char, whose
                 errors are not caught here, so malformed input may raise a
                 UnicodeError.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = b"\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip(b"\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip(b"\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 if isasciistr(s):
                     return s
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s

                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 s = pycompat.bytestr(s)
                 # collect the pieces and join once at the end; repeated
                 # concatenation would copy the growing result on every iteration
                 parts = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters: the low 8 bits of the code point
                     # are the original byte value
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
                     parts.append(c)
                 return "".join(parts)