upstream/mercurial-mirror Commit - r34215:aa877860

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import, print_function

8

from __future__ import absolute_import, print_function

9

10

import io

10

import io

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

policy,

17

policy,

18

pycompat,

18

pycompat,

19

)

19

)

20

21

from .pure import (

21

from .pure import (

22

charencode as charencodepure,

22

charencode as charencodepure,

23

)

23

)

24

25

# Pick the charencode implementation selected by the module policy and
# re-export the helpers the rest of this module uses.
charencode = policy.importmod(r'charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is used in its place
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in ("200c 200d 200e 200f 202a 202b 202c 202d 202e "
              "206a 206b 206c 206d 206e 206f feff").split()
]
# verify the next function will work: hfsignoreclean() only scans strings
# containing one of these two lead bytes
assert all(c.startswith(("\xe2", "\xef")) for c in _ignore)

45

46

def hfsignoreclean(s):
    """Return s with every sequence listed in _ignore stripped out.

    _ignore holds the UTF-8 encodings of the codepoints HFS+ ignores.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # every entry of _ignore starts with \xe2 or \xef (asserted at module
    # level), so a string containing neither needs no further work
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s

58

59

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ # re-exports
elif _nativeenviron:
    environ = os.environb # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(u'utf-8'), v.encode(u'utf-8'))
        for k, v in os.environ.items()
    ) # re-exports

# maps encoding names reported by the locale to a callable returning the
# name to use instead
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

# HGENCODING wins when set; otherwise ask the locale and normalize the
# answer through _encodingfixers.  A broken locale falls back to ascii.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'

86

87

class localstr(bytes):
    '''A byte string in the local encoding that remembers its UTF-8 form.

    Keeping the UTF-8 original next to the local bytes lets an unmodified
    string be round-tripped to the local encoding and back.'''

    def __new__(cls, u, l):
        # u: the UTF-8 bytes to remember; l: the local bytes this object
        # holds as its own value
        obj = bytes.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

96

97

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Decoding is attempted strictly as UTF-8 first, then with the fallback
    encoding (Latin-1); if both fail, the string is decoded as UTF-8 with
    unknown characters replaced.

    When the conversion to the local encoding is lossy, the result is a
    localstr that carries the known UTF-8 form alongside the local bytes,
    so fromlocal() can return the UTF-8 form without loss.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    # ASCII-only input is returned untouched
    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            local = text.encode(_sysstr(encoding), u"replace")
            if text != local.decode(_sysstr(encoding)):
                # lossy: keep the UTF-8 original next to the local bytes
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), u"replace")
                if text != local.decode(_sysstr(encoding)):
                    # lossy: remember the text re-encoded as UTF-8
                    return localstr(text.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                text = s.decode("utf-8", "replace") # last ditch
                # can't round-trip
                return text.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

161

162

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode taken from HGENCODINGMODE, which defaults
    to 'strict'; in that mode undecodable input aborts with an error
    message. Other modes include 'replace', which substitutes a special
    Unicode character for unknown characters, and 'ignore', which drops
    them.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    # ASCII-only input is returned untouched
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes either side of the failure position
        lo = max(0, inst.start - 10)
        hi = inst.start + 10
        sub = s[lo:hi]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

187

188

def unitolocal(u):
    """Return unicode string u as a byte string in the local encoding.

    u is encoded to UTF-8 and the result is passed through tolocal()."""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)

191

192

def unifromlocal(s):
    """Return local-encoding byte string s as a unicode string.

    s is passed through fromlocal() and the result decoded as UTF-8."""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')

195

196

def unimethod(bytesfunc):
    """Wrap bytesfunc so that its bytes result comes back as unicode.

    Intended for building the __unicode__() / __str__() of Python 3 on top
    of an existing __bytes__()-style method."""
    def unifunc(obj):
        raw = bytesfunc(obj)
        return unifromlocal(raw)
    return unifunc

202

203

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # pycompat.identity is used for all three when not on Python 3
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
        for k, v in os.environ.items()
    ) # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The value holds the east-asian-width classes counted as two columns.
_wide = _sysstr(
    "WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
    else "WF")

224

225

def colwidth(s):
    """Return the display column width of local-encoding byte string s.

    s is decoded with the local encoding (undecodable bytes replaced) and
    measured with ucolwidth()."""
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)

228

229

def ucolwidth(d):
    """Return the display column width of Unicode string d.

    Characters whose east-asian-width class is listed in _wide count as
    two columns, all others as one. If unicodedata has no
    east_asian_width, the length of d is returned instead."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(ch) in _wide else 1 for ch in d)

235

236

def getcols(s, start, c):
    '''Return a substring of s, beginning at byte index start, that is
    exactly c columns wide according to colwidth.

    Candidate end offsets run from start + c up to, but not including,
    len(s); the first candidate with the right width is returned. When no
    candidate matches, the function falls off the end and returns None.'''
    for end in xrange(start + c, len(s)):
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece

243

244

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    The right side of 's' is cut by default; with 'leftside' set to True
    the left side is cut instead. 'ellipsis' is attached to whichever side
    was cut.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by byte count
        if len(s) <= width: # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0: # no enough room even for ellipsis
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0: # no enough room even for ellipsis
        return ellipsis[:width]

    # drop one more character per iteration from the side being trimmed
    # until what is left fits into the remaining columns
    for cut in xrange(1, len(u)):
        if leftside:
            usub = u[cut:]
        else:
            usub = u[:-cut]
        if ucolwidth(usub) <= room:
            kept = usub.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    return ellipsis # no enough room for multi-column characters

339

340

def lower(s):
    "best-effort encoding-aware lowercasing of local string s"
    # asciilower() is tried first; a UnicodeDecodeError from it sends us
    # to the encoding-aware path below
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # use the remembered UTF-8 form rather than the local bytes
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.lower()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

360

361

def upper(s):
    "best-effort encoding-aware uppercasing of local string s"
    # asciiupper() is tried first; upperfallback() takes over when it
    # raises UnicodeDecodeError
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
    return folded

367

368

def upperfallback(s):
    "encoding-aware uppercasing of local string s, used by upper()"
    try:
        if isinstance(s, localstr):
            # use the remembered UTF-8 form rather than the local bytes
            text = s._utf8.decode("utf-8")
        else:
            text = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = text.upper()
        if folded != text:
            return folded.encode(_sysstr(encoding))
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

383

384

class normcasespecs(object):
    '''describes the effect of a platform's normcase on ASCII strings

    The value is chosen per platform and should agree with what normcase
    really does there.

    lower: ASCII strings are lowercased by normcase
    upper: ASCII strings are uppercased by normcase
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0

398

399

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON cannot carry non-Unicode bytes, which is a problem for us. The
    approach taken is:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    utf8b = toutf8b(s)
    # the policy-selected fast escaper goes first; when it raises
    # ValueError the pure-Python fallback handles the same input
    try:
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)

450

451

# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# number of bytes to try decoding, indexed by the high nibble of the first
# byte; 0 marks a byte that getutf8char() returns on its own
_utf8len = [
    0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 2, 2, 3, 4,
]

452

459

453

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Returns the bytes making up that character: a single byte when the
    high nibble of the first byte maps to 0 in _utf8len, otherwise the
    slice of the length _utf8len gives.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    nbytes = _utf8len[ord(s[pos:pos + 1]) >> 4]
    if not nbytes: # ascii
        return s[pos:pos + 1]

    char = s[pos:pos + nbytes]
    # validate with attempted decode. Only the _utf8strict handler is
    # used: a plain strict decode would, on Python 3, reject the U+DCxx
    # sequences that _utf8strict ('surrogatepass' there) is meant to allow.
    char.decode("utf-8", _utf8strict)
    return char

469

476

470

def toutf8b(s):

477

def toutf8b(s):

471

'''convert a local, possibly-binary string into UTF-8b

478

'''convert a local, possibly-binary string into UTF-8b

472

479

473

This is intended as a generic method to preserve data when working

480

This is intended as a generic method to preserve data when working

474

with schemes like JSON and XML that have no provision for

481

with schemes like JSON and XML that have no provision for

475

arbitrary byte strings. As Mercurial often doesn't know

482

arbitrary byte strings. As Mercurial often doesn't know

476

what encoding data is in, we use so-called UTF-8b.

483

what encoding data is in, we use so-called UTF-8b.

477

484

478

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

485

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

479

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

486

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

480

uDC00-uDCFF.

487

uDC00-uDCFF.

481

488

482

Principles of operation:

489

Principles of operation:

483

490

484

- ASCII and UTF-8 data successfully round-trips and is understood

491

- ASCII and UTF-8 data successfully round-trips and is understood

485

by Unicode-oriented clients

492

by Unicode-oriented clients

486

- filenames and file contents in arbitrary other encodings can have

493

- filenames and file contents in arbitrary other encodings can have

487

be round-tripped or recovered by clueful clients

494

be round-tripped or recovered by clueful clients

488

- local strings that have a cached known UTF-8 encoding (aka

495

- local strings that have a cached known UTF-8 encoding (aka

489

localstr) get sent as UTF-8 so Unicode-oriented clients get the

496

localstr) get sent as UTF-8 so Unicode-oriented clients get the

490

Unicode data they want

497

Unicode data they want

491

- because we must preserve UTF-8 bytestring in places such as

498

- because we must preserve UTF-8 bytestring in places such as

492

filenames, metadata can't be roundtripped without help

499

filenames, metadata can't be roundtripped without help

493

500

494

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

501

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

495

arbitrary bytes into an internal Unicode format that can be

502

arbitrary bytes into an internal Unicode format that can be

496

re-encoded back into the original. Here we are exposing the

503

re-encoded back into the original. Here we are exposing the

497

internal surrogate encoding as a UTF-8 string.)

504

internal surrogate encoding as a UTF-8 string.)

498

'''

505

'''

499

506

500

if not isinstance(s, localstr) and isasciistr(s):

507

if not isinstance(s, localstr) and isasciistr(s):

501

return s

508

return s

502

if "\xed" not in s:

509

if "\xed" not in s:

503

if isinstance(s, localstr):

510

if isinstance(s, localstr):

504

return s._utf8

511

return s._utf8

505

try:

512

try:

506

s.decode('utf-8')

513

s.decode('utf-8', _utf8strict)

507

return s

514

return s

508

except UnicodeDecodeError:

515

except UnicodeDecodeError:

509

pass

516

pass

510

517

511

s = pycompat.bytestr(s)

518

s = pycompat.bytestr(s)

512

r = ""

519

r = ""

513

pos = 0

520

pos = 0

514

l = len(s)

521

l = len(s)

515

while pos < l:

522

while pos < l:

516

try:

523

try:

517

c = getutf8char(s, pos)

524

c = getutf8char(s, pos)

518

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

525

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

519

# have to re-escape existing U+DCxx characters

526

# have to re-escape existing U+DCxx characters

520

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

527

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

521

pos += 1

528

pos += 1

522

else:

529

else:

523

pos += len(c)

530

pos += len(c)

524

except UnicodeDecodeError:

531

except UnicodeDecodeError:

525

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

532

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)

526

pos += 1

533

pos += 1

527

r += c

534

r += c

528

return r

535

return r

529

536

530

def fromutf8b(s):

537

def fromutf8b(s):

531

'''Given a UTF-8b string, return a local, possibly-binary string.

538

'''Given a UTF-8b string, return a local, possibly-binary string.

532

539

533

return the original binary string. This

540

return the original binary string. This

534

is a round-trip process for strings like filenames, but metadata

541

is a round-trip process for strings like filenames, but metadata

535

that's was passed through tolocal will remain in UTF-8.

542

that's was passed through tolocal will remain in UTF-8.

536

543

537

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

544

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

538

>>> m = b"\\xc3\\xa9\\x99abcd"

545

>>> m = b"\\xc3\\xa9\\x99abcd"

539

>>> toutf8b(m)

546

>>> toutf8b(m)

540

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

547

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

541

>>> roundtrip(m)

548

>>> roundtrip(m)

542

True

549

True

543

>>> roundtrip(b"\\xc2\\xc2\\x80")

550

>>> roundtrip(b"\\xc2\\xc2\\x80")

544

True

551

True

545

>>> roundtrip(b"\\xef\\xbf\\xbd")

552

>>> roundtrip(b"\\xef\\xbf\\xbd")

546

True

553

True

547

>>> roundtrip(b"\\xef\\xef\\xbf\\xbd")

554

>>> roundtrip(b"\\xef\\xef\\xbf\\xbd")

548

True

555

True

549

>>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")

556

>>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")

550

True

557

True

551

'''

558

'''

552

559

553

if isasciistr(s):

560

if isasciistr(s):

554

return s

561

return s

555

# fast path - look for uDxxx prefixes in s

562

# fast path - look for uDxxx prefixes in s

556

if "\xed" not in s:

563

if "\xed" not in s:

557

return s

564

return s

558

565

559

# We could do this with the unicode type but some Python builds

566

# We could do this with the unicode type but some Python builds

560

# use UTF-16 internally (issue5031) which causes non-BMP code

567

# use UTF-16 internally (issue5031) which causes non-BMP code

561

# points to be escaped. Instead, we use our handy getutf8char

568

# points to be escaped. Instead, we use our handy getutf8char

562

# helper again to walk the string without "decoding" it.

569

# helper again to walk the string without "decoding" it.

563

570

564

s = pycompat.bytestr(s)

571

s = pycompat.bytestr(s)

565

r = ""

572

r = ""

566

pos = 0

573

pos = 0

567

l = len(s)

574

l = len(s)

568

while pos < l:

575

while pos < l:

569

c = getutf8char(s, pos)

576

c = getutf8char(s, pos)

570

pos += len(c)

577

pos += len(c)

571

# unescape U+DCxx characters

578

# unescape U+DCxx characters

572

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

579

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

573

c = chr(ord(c.decode("utf-8")) & 0xff)

580

c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)

574

r += c

581

r += c

575

return r

582

return r

576

583

577

if pycompat.ispy3:

584

if pycompat.ispy3:

578

class strio(io.TextIOWrapper):

585

class strio(io.TextIOWrapper):

579

"""Wrapper around TextIOWrapper that respects hg's encoding assumptions.

586

"""Wrapper around TextIOWrapper that respects hg's encoding assumptions.

580

587

581

Also works around Python closing streams.

588

Also works around Python closing streams.

582

"""

589

"""

583

590

584

def __init__(self, buffer):

591

def __init__(self, buffer):

585

super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

592

super(strio, self).__init__(buffer, encoding=_sysstr(encoding))

586

593

587

def __del__(self):

594

def __del__(self):

588

"""Override __del__ so it doesn't close the underlying stream."""

595

"""Override __del__ so it doesn't close the underlying stream."""

589

else:

596

else:

590

strio = pycompat.identity

597

strio = pycompat.identity

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import, print_function
             import io
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 policy,
                 pycompat,
             )
             from .pure import (
                 charencode as charencodepure,
             )
             # Obtain the charencode module via policy.importmod and re-export the
             # helpers used throughout this module under short module-level names.
             charencode = policy.importmod(r'charencode')
             isasciistr = charencode.isasciistr
             asciilower = charencode.asciilower
             asciiupper = charencode.asciiupper
             _jsonescapeu8fast = charencode.jsonescapeu8fast
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 # provide the Python 2 name on Python 3 so the code below can use it
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             # Each entry is the UTF-8 encoding of one of the code points listed below.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work: hfsignoreclean() only scans a string
             # when it contains one of these two lead bytes
             assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
             def hfsignoreclean(s):
                 """Strip every byte sequence listed in _ignore out of s.

                 The sequences in _ignore are the UTF-8 forms of code points that
                 HFS+ ignores. A string containing neither of their lead bytes is
                 returned as-is without scanning.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # fast path: nothing in s can match an ignored sequence
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 cleaned = s
                 for seq in _ignore:
                     cleaned = cleaned.replace(seq, '')
                 return cleaned
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             # _nativeenviron is true on Python 2, and on Python 3 only when
             # os.supports_bytes_environ is set.
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             # Encoding names reported by the locale that are rewritten to 'ascii'.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
             }
             # HGENCODING wins when set and non-empty; otherwise use the locale's
             # preferred encoding (or 'ascii' if that is empty), passed through
             # _encodingfixers. A locale.Error also results in 'ascii'.
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handler name handed to decode() when converting from the local
             # encoding; defaults to "strict"
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # encoding that tolocal() tries when its input is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(bytes):
                 '''A bytes value in the local encoding that remembers its UTF-8 source.

                 The bytes content is the local form (l); the UTF-8 form it came from
                 (u) is kept on the _utf8 attribute so that an unmodified string can
                 be converted back without loss.'''
                 def __new__(cls, u, l):
                     inst = super(localstr, cls).__new__(cls, l)
                     inst._utf8 = u
                     return inst
                 def __hash__(self):
                     # hash on the UTF-8 form to avoid collisions in local string space
                     utf8 = self._utf8
                     return hash(utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns s itself, a plain re-encoded byte string, or a localstr.
                 Raises error.Abort if a codec lookup fails (LookupError).

                 >>> u = b'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'

                 >>> u2 = b'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> b'foo: ?' in d
                 False

                 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 # ASCII-only input is returned unchanged
                 if isasciistr(s):
                     return s
                 # the outer try only translates codec lookup failures into an Abort
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         # characters the local encoding lacks are replaced here
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the original UTF-8 alongside the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the UTF-8 re-encoding of the fallback decode
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr yields its cached UTF-8 form and an ASCII-only string is
                 returned unchanged. Anything else is decoded with the local
                 encoding using the error mode from HGENCODINGMODE (default
                 'strict') and re-encoded as UTF-8. In 'strict' mode unknown
                 characters cause an error message; 'replace' substitutes a special
                 Unicode character and 'ignore' drops the character.

                 Raises error.Abort on a decoding failure or an unknown codec.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 elif isasciistr(s):
                     return s
                 try:
                     return s.decode(_sysstr(encoding),
                                     _sysstr(encodingmode)).encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to ten bytes either side of the failure position
                     begin = max(0, err.start - 10)
                     context = s[begin:err.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def unitolocal(u):
                 """Encode unicode string u as UTF-8 and pass the result to tolocal()"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 """Run local byte string s through fromlocal() and decode it as UTF-8"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             def unimethod(bytesfunc):
                 """Wrap bytesfunc so its result is returned as a unicode string.

                 Intended as a proxy method that forwards __unicode__() and __str__()
                 of Python 3 to __bytes__()."""
                 def unifunc(obj):
                     localbytes = bytesfunc(obj)
                     return unifromlocal(localbytes)
                 return unifunc
             # Converter functions between native str and byte string. Use these when
             # the character encoding is not known (e.g. exception message) or is known
             # to be locale dependent (e.g. date formatting.)
             # On Python 3 they are the unicode converters defined above; otherwise
             # they are pycompat.identity.
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
                 strmethod = unimethod
             else:
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
                 strmethod = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             # _wide holds the east_asian_width codes that ucolwidth() counts as two
             # columns: "WFA" when HGENCODINGAMBIGUOUS is "wide", otherwise "WF".
             _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                             and "WFA" or "WF")
             def colwidth(s):
                 """Return the display column width of local-encoding byte string s.

                 Bytes that cannot be decoded are replaced before measuring."""
                 decoded = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 """Return the display column width of Unicode string d.

                 A character whose east_asian_width code is in _wide counts as two
                 columns, any other character as one. If unicodedata has no
                 east_asian_width, the result is simply len(d)."""
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 columns = 0
                 for ch in d:
                     if eaw(ch) in _wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Find a substring of s, beginning at byte index start, that is
                 exactly c columns wide according to colwidth.

                 Candidate end offsets run from start + c up to, but not including,
                 len(s); the first match is returned. Falls off the end (returning
                 None) when no candidate matches.'''
                 for end in xrange(start + c, len(s)):
                     piece = s[start:end]
                     if colwidth(piece) != c:
                         continue
                     return piece
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 When 's' cannot be decoded with the local encoding it is trimmed
                 by byte count instead of by display column.

                 >>> from .node import bin
                 >>> def bprint(s):
                 ...     print(pycompat.sysstr(s))
                 >>> ellipsis = b'+++'
                 >>> from . import encoding
                 >>> encoding.encoding = b'utf-8'
                 >>> t = b'1234567890'
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 1234567890
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 12345+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++67890
                 >>> bprint(trim(t, 8))
                 12345678
                 >>> bprint(trim(t, 8, leftside=True))
                 34567890
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +

                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 5))
                 \xe3\x81\x82\xe3\x81\x84
                 >>> bprint(trim(t, 5, leftside=True))
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> bprint(trim(t, 4, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
                 +++

                 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
                 >>> bprint(trim(t, 12, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 10, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8, ellipsis=ellipsis))
                 \x11\x22\x33\x44\x55+++
                 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
                 +++\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 8))
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> bprint(trim(t, 8, leftside=True))
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> bprint(trim(t, 3, ellipsis=ellipsis))
                 +++
                 >>> bprint(trim(t, 1, ellipsis=ellipsis))
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # undecodable input: fall back to counting bytes
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         # width is now <= 0, so this keeps only the part of the
                         # ellipsis that fits in the originally requested width
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # reserve room for the ellipsis (measured with len())
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side;
                 # concat() attaches the ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the remainder fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(_sysstr(encoding)))
                 return ellipsis # not enough room for multi-column characters
             def lower(s):
                 """Best-effort, encoding-aware lowercasing of local string s.

                 Tries asciilower() first. If that raises UnicodeDecodeError, the
                 string is decoded (from its cached UTF-8 for a localstr, otherwise
                 from the local encoding), lowercased and re-encoded; s itself is
                 returned when lowercasing changes nothing. Raises error.Abort when
                 a codec lookup fails."""
                 try:
                     folded = asciilower(s)
                 except UnicodeDecodeError:
                     pass
                 else:
                     return folded
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     lowered = text.lower()
                     if lowered != text:
                         return lowered.encode(_sysstr(encoding))
                     # nothing changed: hand back s so a localstr is preserved
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.lower()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 """Best-effort, encoding-aware uppercasing of local string s.

                 Uses asciiupper() and hands s to upperfallback() when that raises
                 UnicodeDecodeError."""
                 try:
                     folded = asciiupper(s)
                 except UnicodeDecodeError:
                     return upperfallback(s)
                 return folded
             def upperfallback(s):
                 """Uppercase local string s by decoding, folding and re-encoding it.

                 A localstr is decoded from its cached UTF-8, anything else from the
                 local encoding. s itself is returned when uppercasing changes
                 nothing. Raises error.Abort when a codec lookup fails."""
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     raised = text.upper()
                     if raised != text:
                         return raised.encode(_sysstr(encoding))
                     # nothing changed: hand back s so a localstr is preserved
                     return s
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.upper()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''Constants describing what a platform's normcase does to ASCII strings.

                 This is specified per platform, and should be consistent with what
                 normcase on that platform actually does.

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''
                 lower, other, upper = -1, 0, 1
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape(b'this is a test')
                 'this is a test'
                 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
                 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape(b'a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape(b'')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> s = b'escape characters: \\0 \\x0b \\x7f'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
                 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
                 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape(b'<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 # normalise the input to UTF-8b before any escaping is applied
                 u8chars = toutf8b(s)
                 try:
                     return _jsonescapeu8fast(u8chars, paranoid)
                 except ValueError:
                     # the fast escaper rejected this input; use the pure fallback below
                     pass
                 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+            # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+            # bytes are mapped to that range.
+            if pycompat.ispy3:
+                _utf8strict = r'surrogatepass'
+            else:
+                _utf8strict = r'strict'
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''get the next full utf-8 character in the given string, starting at pos
                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 # find how many bytes to attempt decoding from first nibble
                 l = _utf8len[ord(s[pos:pos + 1]) >> 4]
                 if not l: # ascii
                     return s[pos:pos + 1]
                 c = s[pos:pos + l]
                 # validate with attempted decode
-                c.decode("utf-8")
+                c.decode("utf-8", _utf8strict)
                 return c
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b
                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.
                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.
                 Principles of operation:
                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can have
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help
                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if not isinstance(s, localstr) and isasciistr(s):
                     return s
                 if "\xed" not in s:
                     if isinstance(s, localstr):
                         return s._utf8
                     try:
-                        s.decode('utf-8')
+                        s.decode('utf-8', _utf8strict)
                         return s
                     except UnicodeDecodeError:
                         pass
                 s = pycompat.bytestr(s)
                 r = ""
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
-                            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
-                        c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                        c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                         pos += 1
                     r += c
                 return r
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 Returns the original binary string. This is a round-trip process
                 for strings like filenames, but metadata that was passed through
                 tolocal will remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = b"\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip(b"\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip(b"\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # pure-ASCII input is returned unchanged
                 if isasciistr(s):
                     return s
                 # fast path - look for uDxxx prefixes in s
                 # (the U+DCxx range tested below always starts with byte 0xed)
                 if "\xed" not in s:
                     return s
                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 s = pycompat.bytestr(s)
                 r = ""
                 pos = 0
                 l = len(s)
                 while pos < l:
                     # advance by one whole UTF-8 sequence at a time
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         # keep only the low 8 bits of the code point, which
                         # turns an escaped U+DCxx back into a single byte
-                        c = chr(ord(c.decode("utf-8")) & 0xff)
+                        c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
                     r += c
                 return r
             # strio wraps a binary stream for text I/O. It is only a real class
             # when pycompat.ispy3 is true; otherwise it is pycompat.identity.
             if pycompat.ispy3:
                 class strio(io.TextIOWrapper):
                     """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
                     Also works around Python closing streams.
                     """
                     def __init__(self, buffer):
                         # 'encoding' is not defined in this block; it is looked
                         # up from the enclosing module scope and converted with
                         # _sysstr before being handed to TextIOWrapper.
                         super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
                     def __del__(self):
                         """Override __del__ so it doesn't close the underlying stream."""
             else:
                 # NOTE(review): presumably pycompat.identity returns its argument
                 # unchanged, so no wrapping happens here - confirm in pycompat.
                 strio = pycompat.identity

             # charencode.py - miscellaneous character encoding
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             from .. import (
                 pycompat,
             )
             def isasciistr(s):
                 """Return True if the byte string ``s`` decodes as ASCII, else False."""
                 try:
                     # the decoded value is not needed, only whether decoding works
                     s.decode('ascii')
                 except UnicodeDecodeError:
                     return False
                 else:
                     return True
             def asciilower(s):
                 '''Return ``s`` converted to lowercase, provided it is ASCII.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decode purely as a validity check; the result is discarded
                 s.decode('ascii')
                 lowered = s.lower()
                 return lowered
             def asciiupper(s):
                 '''Return ``s`` converted to uppercase, provided it is ASCII.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decode purely as a validity check; the result is discarded
                 s.decode('ascii')
                 uppered = s.upper()
                 return uppered
             # Lookup tables indexed by byte value; each entry is the JSON-escaped
             # form of that byte.
             # Control characters default to \uXXXX; the short escapes are patched
             # in below.
             _jsonmap = list("\\u%04x" % x for x in range(32))
             _jsonmap += (pycompat.bytechr(x) for x in range(32, 127))
             _jsonmap += ['\\u007f']
             _jsonmap[ord('\b')] = '\\b'
             _jsonmap[ord('\t')] = '\\t'
             _jsonmap[ord('\n')] = '\\n'
             _jsonmap[ord('\f')] = '\\f'
             _jsonmap[ord('\r')] = '\\r'
             _jsonmap[ord('"')] = '\\"'
             _jsonmap[ord('\\')] = '\\\\'
             # The paranoid table is copied while only entries 0..127 exist, so
             # it never gains the pass-through entries for bytes >= 128.
             _paranoidjsonmap = list(_jsonmap)
             _paranoidjsonmap[ord('<')] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[ord('>')] = '\\u003e'  # '>'
             _jsonmap += (pycompat.bytechr(x) for x in range(128, 256))
             def jsonescapeu8fast(u8chars, paranoid):
                 """Convert a UTF-8 byte string to JSON-escaped form (fast path)

                 Each byte of ``u8chars`` is looked up in the paranoid or the
                 regular escape table, depending on ``paranoid``.

                 Raises ValueError if non-ASCII characters have to be escaped.
                 """
                 table = _paranoidjsonmap if paranoid else _jsonmap
                 try:
                     escaped = [table[byte] for byte in bytearray(u8chars)]
                 except IndexError:
                     # the byte has no entry in the selected table
                     raise ValueError
                 return ''.join(escaped)
             # Error-handler name passed to the UTF-8/UTF-16 encode()/decode()
             # calls. 'surrogatepass' is used when pycompat.ispy3 is true so that
             # surrogate code points are let through instead of raising.
             # NOTE(review): presumably 'strict' already lets them through on
             # Python 2 - confirm.
+            if pycompat.ispy3:
+                _utf8strict = r'surrogatepass'
+            else:
+                _utf8strict = r'strict'
             def jsonescapeu8fallback(u8chars, paranoid):
                 """Convert a UTF-8 byte string to JSON-escaped form (slow path)

                 The input is re-encoded as UTF-16 and processed one 16-bit code
                 unit at a time. Code units below 128 are mapped through the
                 escape table selected by ``paranoid``; all others are written as
                 ``\\uXXXX``.

                 Escapes all non-ASCII characters no matter if paranoid is False.
                 """
                 if paranoid:
                     jm = _paranoidjsonmap
                 else:
                     jm = _jsonmap
                 # non-BMP char is represented as UTF-16 surrogate pair
                 # (typecode 'H' is unsigned short; this assumes 2-byte items)
-                u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+                u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+                u16codes = array.array(r'H', u16b)
                 u16codes.pop(0)  # drop BOM
                 # only ASCII code units use the table; the rest become \uXXXX
                 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)

             # this is a hack to make sure no escape characters are inserted into the output
             from __future__ import absolute_import
             import doctest
             import os
             import re
             import sys
             # True when running under Python 3 or later
             ispy3 = sys.version_info >= (3,)
             # drop TERM from the environment if it is set; no-op otherwise
             os.environ.pop('TERM', None)
             class py3docchecker(doctest.OutputChecker):
                 """Output checker that tolerates Python 2/3 differences in doctests.

                 The output is accepted if it matches as-is, or if it matches after
                 u'' prefixes are stripped from the expected text and b'' prefixes
                 and ``mercurial.<module>.`` exception qualifiers are stripped from
                 the actual text.
                 """
                 def check_output(self, want, got, optionflags):
                     """Return True if ``got`` matches ``want``, raw or normalized.

                     ``want`` is the expected text, ``got`` the actual output and
                     ``optionflags`` the doctest option flags, which are passed
                     through to the base class comparison.
                     """
                     want2 = re.sub(r'''\bu(['"])(.*?)\1''', r'\1\2\1', want)  # py2: u''
                     got2 = re.sub(r'''\bb(['"])(.*?)\1''', r'\1\2\1', got)  # py3: b''
                     # py3: <exc.name>: b'<msg>' -> <name>: <msg>
                     #      <exc.name>: <others> -> <name>: <others>
                     # The flags must be given by keyword: the fourth positional
                     # argument of re.sub() is 'count', so a positional
                     # re.MULTILINE would not make '^' match at line starts.
                     got2 = re.sub(r'''^mercurial\.\w+\.(\w+): (['"])(.*?)\2''', r'\1: \3',
                                   got2, flags=re.MULTILINE)
                     got2 = re.sub(r'^mercurial\.\w+\.(\w+): ', r'\1: ', got2,
                                   flags=re.MULTILINE)
                     return any(doctest.OutputChecker.check_output(self, w, g, optionflags)
                                for w, g in [(want, got), (want2, got2)])
             # TODO: migrate doctests to py3 and enable them on both versions
             def testmod(name, optionflags=0, testtarget=None, py2=True, py3=True):
                 """Import module ``name`` and run its doctests.

                 ``py2`` / ``py3`` select whether the module is tested on the
                 running major Python version; when the matching flag is false the
                 call returns without importing anything. If ``testtarget`` is
                 given, only that attribute of the module is searched for
                 doctests. ``optionflags`` is handed to the doctest runner.
                 """
                 enabled = py3 if ispy3 else py2
                 if not enabled:
                     return
                 __import__(name)
                 target = sys.modules[name]
                 if testtarget is not None:
                     target = getattr(target, testtarget)
                 # minimal copy of doctest.testmod()
                 checker = py3docchecker() if ispy3 else None
                 runner = doctest.DocTestRunner(checker=checker, optionflags=optionflags)
                 for case in doctest.DocTestFinder().find(target, name):
                     runner.run(case)
                 runner.summarize()
             # Run the doctests of every listed module. A py3=False argument makes
             # testmod() return without testing that module on Python 3.
             testmod('mercurial.changegroup')
             testmod('mercurial.changelog')
             testmod('mercurial.color')
             testmod('mercurial.config')
             testmod('mercurial.context')
             testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
             testmod('mercurial.dispatch')
-            testmod('mercurial.encoding', py3=False)  # py3: multiple encoding issues
+            testmod('mercurial.encoding')
             testmod('mercurial.formatter', py3=False)  # py3: write bytes to stdout
             testmod('mercurial.hg')
             testmod('mercurial.hgweb.hgwebdir_mod', py3=False)  # py3: repr(bytes) ?
             testmod('mercurial.match')
             testmod('mercurial.mdiff')
             testmod('mercurial.minirst')
             testmod('mercurial.patch', py3=False)  # py3: bytes[n], etc. ?
             testmod('mercurial.pathutil', py3=False)  # py3: os.sep
             testmod('mercurial.parser')
             testmod('mercurial.pycompat')
             testmod('mercurial.revsetlang')
             testmod('mercurial.smartset')
             testmod('mercurial.store')
             testmod('mercurial.subrepo')
             testmod('mercurial.templatefilters')
             testmod('mercurial.templater')
             testmod('mercurial.ui')
             testmod('mercurial.url')
             testmod('mercurial.util', py3=False)  # py3: multiple bytes/unicode issues
             # only the 'platform' attribute of mercurial.util is searched here
             testmod('mercurial.util', testtarget='platform')
             testmod('hgext.convert.convcmd', py3=False)  # py3: use of str() ?
             testmod('hgext.convert.cvsps')
             testmod('hgext.convert.filemap')
             testmod('hgext.convert.p4')
             testmod('hgext.convert.subversion')
             testmod('hgext.mq')
             # Helper scripts in tests/ that have doctests:
             testmod('drawdag')