encoding: extract stub for fast JSON escape...
Yuya Nishihara
r33925:b9101467 default
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -1,591 +1,571 @@
 # encoding.py - character transcoding support for Mercurial
 #
 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
-import array
 import io
 import locale
 import os
 import unicodedata
 
 from . import (
     error,
     policy,
     pycompat,
 )
 
+from .pure import (
+    charencode as charencodepure,
+)
+
 charencode = policy.importmod(r'charencode')
 
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
+_jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure"
 
 _sysstr = pycompat.sysstr
 
 if pycompat.ispy3:
     unichr = chr
 
 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
 # "Unicode Subtleties"), so we need to ignore them in some places for
 # sanity.
 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
            "200c 200d 200e 200f 202a 202b 202c 202d 202e "
            "206a 206b 206c 206d 206e 206f feff".split()]
 # verify the next function will work
 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
 
 def hfsignoreclean(s):
     """Remove codepoints ignored by HFS+ from s.
 
     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
     '.hg'
     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
     '.hg'
     """
     if "\xe2" in s or "\xef" in s:
         for c in _ignore:
             s = s.replace(c, '')
     return s
 
 # encoding.environ is provided read-only, which may not be used to modify
 # the process environment
 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
 if not pycompat.ispy3:
     environ = os.environ # re-exports
 elif _nativeenviron:
     environ = os.environb # re-exports
 else:
     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
     # and recreate it once encoding is settled
     environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                    for k, v in os.environ.items()) # re-exports
 
 _encodingfixers = {
     '646': lambda: 'ascii',
     'ANSI_X3.4-1968': lambda: 'ascii',
 }
 
 try:
     encoding = environ.get("HGENCODING")
     if not encoding:
         encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
         encoding = _encodingfixers.get(encoding, lambda: encoding)()
 except locale.Error:
     encoding = 'ascii'
 encodingmode = environ.get("HGENCODINGMODE", "strict")
 fallbackencoding = 'ISO-8859-1'
 
 class localstr(bytes):
     '''This class allows strings that are unmodified to be
     round-tripped to the local encoding and back'''
     def __new__(cls, u, l):
         s = bytes.__new__(cls, l)
         s._utf8 = u
         return s
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
 
     All internal strings should be UTF-8 but some repos before the
     implementation of locale support may contain latin1 or possibly
     other character sets. We attempt to decode everything strictly
     using UTF-8, then Latin-1, and failing that, we use UTF-8 and
     replace unknown characters.
 
     The localstr class is used to cache the known UTF-8 encoding of
     strings next to their local representation to allow lossless
     round-trip conversion back to UTF-8.
 
     >>> u = 'foo: \\xc3\\xa4' # utf-8
     >>> l = tolocal(u)
     >>> l
     'foo: ?'
     >>> fromlocal(l)
     'foo: \\xc3\\xa4'
     >>> u2 = 'foo: \\xc3\\xa1'
     >>> d = { l: 1, tolocal(u2): 2 }
     >>> len(d) # no collision
     2
     >>> 'foo: ?' in d
     False
     >>> l1 = 'foo: \\xe4' # historical latin1 fallback
     >>> l = tolocal(l1)
     >>> l
     'foo: ?'
     >>> fromlocal(l) # magically in utf-8
     'foo: \\xc3\\xa4'
     """
 
     try:
         try:
             # make sure string is actually stored in UTF-8
             u = s.decode('UTF-8')
             if encoding == 'UTF-8':
                 # fast path
                 return s
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
                 return r
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
             try:
                 u = s.decode(_sysstr(fallbackencoding))
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
                     return r
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
                 # can't round-trip
                 return u.encode(_sysstr(encoding), u"replace")
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def fromlocal(s):
     """
     Convert a string from the local character encoding to UTF-8
 
     We attempt to decode strings using the encoding mode set by
     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
     characters will cause an error message. Other modes include
     'replace', which replaces unknown characters with a special
     Unicode character, and 'ignore', which drops the character.
     """
 
     # can we do a lossless round-trip?
     if isinstance(s, localstr):
         return s._utf8
 
     try:
         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
         return u.encode("utf-8")
     except UnicodeDecodeError as inst:
         sub = s[max(0, inst.start - 10):inst.start + 10]
         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def unitolocal(u):
     """Convert a unicode string to a byte string of local encoding"""
     return tolocal(u.encode('utf-8'))
 
 def unifromlocal(s):
     """Convert a byte string of local encoding to a unicode string"""
     return fromlocal(s).decode('utf-8')
 
 def unimethod(bytesfunc):
     """Create a proxy method that forwards __unicode__() and __str__() of
     Python 3 to __bytes__()"""
     def unifunc(obj):
         return unifromlocal(bytesfunc(obj))
     return unifunc
 
 # converter functions between native str and byte string. use these if the
 # character encoding is not known (e.g. exception messages) or is known to
 # be locale dependent (e.g. date formatting.)
 if pycompat.ispy3:
     strtolocal = unitolocal
     strfromlocal = unifromlocal
     strmethod = unimethod
 else:
     strtolocal = pycompat.identity
     strfromlocal = pycompat.identity
     strmethod = pycompat.identity
 
 if not _nativeenviron:
     # now encoding and helper functions are available, recreate the environ
     # dict to be exported to other modules
     environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                    for k, v in os.environ.items()) # re-exports
 
 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                 and "WFA" or "WF")
 
 def colwidth(s):
     "Find the column width of a string for display in the local encoding"
     return ucolwidth(s.decode(_sysstr(encoding), u'replace'))
 
 def ucolwidth(d):
     "Find the column width of a Unicode string for display"
     eaw = getattr(unicodedata, 'east_asian_width', None)
     if eaw is not None:
         return sum([eaw(c) in _wide and 2 or 1 for c in d])
     return len(d)
 
 def getcols(s, start, c):
     '''Use colwidth to find a c-column substring of s starting at byte
     index start'''
     for x in xrange(start + c, len(s)):
         t = s[start:x]
         if colwidth(t) == c:
             return t
 
 def trim(s, width, ellipsis='', leftside=False):
     """Trim string 's' to at most 'width' columns (including 'ellipsis').
 
     If 'leftside' is True, the left side of string 's' is trimmed.
     'ellipsis' is always placed at the trimmed side.
 
     >>> ellipsis = '+++'
     >>> from . import encoding
     >>> encoding.encoding = 'utf-8'
     >>> t = '1234567890'
     >>> print trim(t, 12, ellipsis=ellipsis)
     1234567890
     >>> print trim(t, 10, ellipsis=ellipsis)
     1234567890
     >>> print trim(t, 8, ellipsis=ellipsis)
     12345+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++67890
     >>> print trim(t, 8)
     12345678
     >>> print trim(t, 8, leftside=True)
     34567890
     >>> print trim(t, 3, ellipsis=ellipsis)
     +++
     >>> print trim(t, 1, ellipsis=ellipsis)
     +
     >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
     >>> t = u.encode(encoding.encoding)
     >>> print trim(t, 12, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 10, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 8, ellipsis=ellipsis)
     \xe3\x81\x82\xe3\x81\x84+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++\xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 5)
     \xe3\x81\x82\xe3\x81\x84
     >>> print trim(t, 5, leftside=True)
     \xe3\x81\x88\xe3\x81\x8a
     >>> print trim(t, 4, ellipsis=ellipsis)
     +++
     >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
     +++
     >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
     >>> print trim(t, 12, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 10, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 8, ellipsis=ellipsis)
     \x11\x22\x33\x44\x55+++
     >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
     +++\x66\x77\x88\x99\xaa
     >>> print trim(t, 8)
     \x11\x22\x33\x44\x55\x66\x77\x88
     >>> print trim(t, 8, leftside=True)
     \x33\x44\x55\x66\x77\x88\x99\xaa
     >>> print trim(t, 3, ellipsis=ellipsis)
     +++
     >>> print trim(t, 1, ellipsis=ellipsis)
     +
     """
     try:
         u = s.decode(_sysstr(encoding))
     except UnicodeDecodeError:
         if len(s) <= width: # trimming is not needed
             return s
         width -= len(ellipsis)
         if width <= 0: # not enough room even for ellipsis
             return ellipsis[:width + len(ellipsis)]
         if leftside:
             return ellipsis + s[-width:]
         return s[:width] + ellipsis
 
     if ucolwidth(u) <= width: # trimming is not needed
         return s
 
     width -= len(ellipsis)
     if width <= 0: # not enough room even for ellipsis
         return ellipsis[:width + len(ellipsis)]
 
     if leftside:
         uslice = lambda i: u[i:]
         concat = lambda s: ellipsis + s
     else:
         uslice = lambda i: u[:-i]
         concat = lambda s: s + ellipsis
     for i in xrange(1, len(u)):
         usub = uslice(i)
         if ucolwidth(usub) <= width:
             return concat(usub.encode(_sysstr(encoding)))
     return ellipsis # not enough room for multi-column characters
 
 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         return asciilower(s)
     except UnicodeDecodeError:
         pass
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
 
         lu = u.lower()
         if u == lu:
             return s # preserve localstring
         return lu.encode(_sysstr(encoding))
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 def upper(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
         return asciiupper(s)
     except UnicodeDecodeError:
         return upperfallback(s)
 
 def upperfallback(s):
     try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
 
         uu = u.upper()
         if u == uu:
             return s # preserve localstring
         return uu.encode(_sysstr(encoding))
     except UnicodeError:
         return s.upper() # we don't know how to fold this except in ASCII
     except LookupError as k:
         raise error.Abort(k, hint="please check your locale settings")
 
 class normcasespecs(object):
     '''what a platform's normcase does to ASCII strings
 
     This is specified per platform, and should be consistent with what normcase
     on that platform actually does.
 
     lower: normcase lowercases ASCII strings
     upper: normcase uppercases ASCII strings
     other: the fallback function should always be called
 
     This should be kept in sync with normcase_spec in util.h.'''
     lower = -1
     upper = 1
     other = 0
 
-_jsonmap = []
-_jsonmap.extend("\\u%04x" % x for x in range(32))
-_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
-_jsonmap.append('\\u007f')
-_jsonmap[0x09] = '\\t'
-_jsonmap[0x0a] = '\\n'
-_jsonmap[0x22] = '\\"'
-_jsonmap[0x5c] = '\\\\'
-_jsonmap[0x08] = '\\b'
-_jsonmap[0x0c] = '\\f'
-_jsonmap[0x0d] = '\\r'
-_paranoidjsonmap = _jsonmap[:]
-_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
-_paranoidjsonmap[0x3e] = '\\u003e' # '>'
-_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
-
 def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
     - localstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
 
     (escapes are doubled in these tests)
 
     >>> jsonescape('this is a test')
     'this is a test'
     >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
     'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
     >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
     'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
     >>> jsonescape('a weird byte: \\xdd')
     'a weird byte: \\xed\\xb3\\x9d'
     >>> jsonescape('utf-8: caf\\xc3\\xa9')
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
 
     If paranoid, non-ASCII and common troublesome characters are also escaped.
     This is suitable for web output.
 
     >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
     'escape boundary: ~ \\\\u007f \\\\u0080'
     >>> jsonescape('a weird byte: \\xdd', paranoid=True)
     'a weird byte: \\\\udcdd'
     >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
     'utf-8: caf\\\\u00e9'
     >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
     'non-BMP: \\\\ud834\\\\udd1e'
     >>> jsonescape('<foo@example.org>', paranoid=True)
     '\\\\u003cfoo@example.org\\\\u003e'
     '''
 
-    if paranoid:
-        jm = _paranoidjsonmap
-    else:
-        jm = _jsonmap
-
     u8chars = toutf8b(s)
     try:
-        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
-    except IndexError:
+        return _jsonescapeu8fast(u8chars, paranoid)
+    except ValueError:
         pass
-    # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
-    u16codes.pop(0) # drop BOM
-    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
+    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
 def getutf8char(s, pos):
     '''get the next full utf-8 character in the given string, starting at pos
 
     Raises a UnicodeError if the given location does not start a valid
     utf-8 character.
     '''
 
     # find how many bytes to attempt decoding from first nibble
     l = _utf8len[ord(s[pos]) >> 4]
     if not l: # ascii
         return s[pos]
 
     c = s[pos:pos + l]
     # validate with attempted decode
     c.decode("utf-8")
     return c
 
 def toutf8b(s):
     '''convert a local, possibly-binary string into UTF-8b
 
     This is intended as a generic method to preserve data when working
     with schemes like JSON and XML that have no provision for
     arbitrary byte strings. As Mercurial often doesn't know
     what encoding data is in, we use so-called UTF-8b.
 
     If a string is already valid UTF-8 (or ASCII), it passes unmodified.
     Otherwise, unsupported bytes are mapped to the UTF-16 surrogate range,
     uDC00-uDCFF.
 
     Principles of operation:
 
     - ASCII and UTF-8 data successfully round-trips and is understood
       by Unicode-oriented clients
     - filenames and file contents in arbitrary other encodings can
       be round-tripped or recovered by clueful clients
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
     arbitrary bytes into an internal Unicode format that can be
     re-encoded back into the original. Here we are exposing the
     internal surrogate encoding as a UTF-8 string.)
     '''
 
     if "\xed" not in s:
         if isinstance(s, localstr):
             return s._utf8
         try:
             s.decode('utf-8')
             return s
         except UnicodeDecodeError:
             pass
 
     r = ""
     pos = 0
     l = len(s)
     while pos < l:
         try:
             c = getutf8char(s, pos)
             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                 # have to re-escape existing U+DCxx characters
                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                 pos += 1
             else:
                 pos += len(c)
         except UnicodeDecodeError:
             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
             pos += 1
         r += c
     return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.
 
     This returns the original binary string. It is a round-trip process
     for strings like filenames, but metadata that was passed through
     tolocal will remain in UTF-8.
 
     >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
     >>> m = "\\xc3\\xa9\\x99abcd"
     >>> toutf8b(m)
     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
     >>> roundtrip(m)
     True
     >>> roundtrip("\\xc2\\xc2\\x80")
     True
     >>> roundtrip("\\xef\\xbf\\xbd")
     True
     >>> roundtrip("\\xef\\xef\\xbf\\xbd")
     True
     >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
     True
     '''
 
     # fast path - look for uDxxx prefixes in s
     if "\xed" not in s:
         return s
 
     # We could do this with the unicode type but some Python builds
     # use UTF-16 internally (issue5031) which causes non-BMP code
     # points to be escaped. Instead, we use our handy getutf8char
     # helper again to walk the string without "decoding" it.
 
     r = ""
     pos = 0
     l = len(s)
     while pos < l:
         c = getutf8char(s, pos)
         pos += len(c)
         # unescape U+DCxx characters
         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
             c = chr(ord(c.decode("utf-8")) & 0xff)
         r += c
     return r
 
 if pycompat.ispy3:
     class strio(io.TextIOWrapper):
         """Wrapper around TextIOWrapper that respects hg's encoding assumptions.
 
         Also works around Python closing streams.
         """
 
         def __init__(self, buffer):
             super(strio, self).__init__(buffer, encoding=_sysstr(encoding))
 
         def __del__(self):
             """Override __del__ so it doesn't close the underlying stream."""
 else:
     strio = pycompat.identity
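The core of the change is visible in `jsonescape()` above: the escape-table lookup now lives behind `_jsonescapeu8fast()`, and `ValueError` is the agreed signal for "take the slow path". Below is a minimal, self-contained sketch of that contract for reviewers who want to run it. It uses Python 3 `bytes`/`str` rather than Mercurial's Python 2 bytestrings and only builds the 128-entry ASCII table (the paranoid case), so it approximates the patched code rather than reproducing it.

```python
import array

# ASCII-only escape table, mirroring _paranoidjsonmap in the patch.
_jsonmap = ['\\u%04x' % x for x in range(32)]      # C0 control characters
_jsonmap.extend(chr(x) for x in range(32, 127))    # printable ASCII, as-is
_jsonmap.append('\\u007f')                         # DEL
for code, esc in ((0x08, '\\b'), (0x09, '\\t'), (0x0a, '\\n'), (0x0c, '\\f'),
                  (0x0d, '\\r'), (0x22, '\\"'), (0x5c, '\\\\')):
    _jsonmap[code] = esc

def jsonescapeu8fast(u8chars):
    """Fast path: one table lookup per byte; ValueError means 'cannot'."""
    try:
        return ''.join(_jsonmap[x] for x in bytearray(u8chars))
    except IndexError:  # a byte >= 0x80 has no table entry
        raise ValueError

def jsonescapeu8fallback(u8chars):
    """Slow path: escape non-ASCII as \\uXXXX, non-BMP as surrogate pairs."""
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)  # drop the BOM that encode('utf-16') prepends
    return ''.join(_jsonmap[x] if x < 128 else '\\u%04x' % x
                   for x in u16codes)

def jsonescape(u8chars):
    """Dispatch the same way the patched encoding.jsonescape() does."""
    try:
        return jsonescapeu8fast(u8chars)
    except ValueError:
        return jsonescapeu8fallback(u8chars)

print(jsonescape(b'tab\there'))                # -> tab\there
print(jsonescape(u'caf\xe9'.encode('utf-8')))  # -> caf\u00e9
```

Signalling "needs the slow path" with an exception rather than a flag keeps the common case a single expression, and gives a future C implementation of the fast path a natural way to punt without computing a partial result.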
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -1,22 +1,72 @@
 # charencode.py - miscellaneous character encoding
 #
 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
 from __future__ import absolute_import
 
+import array
+
+from .. import (
+    pycompat,
+)
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.lower()
 
 def asciiupper(s):
     '''convert a string to uppercase if ASCII
 
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.upper()
+
+_jsonmap = []
+_jsonmap.extend("\\u%04x" % x for x in range(32))
+_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
+_jsonmap.append('\\u007f')
+_jsonmap[0x09] = '\\t'
+_jsonmap[0x0a] = '\\n'
+_jsonmap[0x22] = '\\"'
+_jsonmap[0x5c] = '\\\\'
+_jsonmap[0x08] = '\\b'
+_jsonmap[0x0c] = '\\f'
+_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
+_paranoidjsonmap[0x3e] = '\\u003e' # '>'
+_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
+
+def jsonescapeu8fast(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
+
+    Raises ValueError if non-ASCII characters have to be escaped.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))
+    except IndexError:
+        raise ValueError
+
+def jsonescapeu8fallback(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
+
+    Escapes all non-ASCII characters even if paranoid is False.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0) # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
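Why land these helpers in `mercurial/pure/charencode.py`? Because `policy.importmod(r'charencode')` already resolves `charencode` to a compiled module when one is available, with the `pure` package as the fallback; extracting `jsonescapeu8fast()` here gives a future C implementation a slot to fill. A rough sketch of that selection pattern follows; it is illustrative only, not the actual `policy` implementation (the real `policy.importmod` also honours the `HGMODULEPOLICY` environment variable and checks module compatibility):

```python
import importlib

def importmod(modname):
    """Prefer a compiled implementation of modname, else the pure one.

    Simplified stand-in for mercurial.policy.importmod.
    """
    try:
        # e.g. mercurial.cext.charencode, built from C sources
        return importlib.import_module('mercurial.cext.' + modname)
    except ImportError:
        # e.g. mercurial.pure.charencode, the module shown above
        return importlib.import_module('mercurial.pure.' + modname)

charencode = importmod('charencode')
```

Until the `# TODO: no "pure"` in encoding.py is resolved, `_jsonescapeu8fast` is still taken directly from the pure module, so this changeset reshuffles structure without changing behaviour; `jsonescapeu8fallback()` is presumably meant to stay the pure-Python slow path, since it only runs on escape-heavy inputs.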