upstream/mercurial-mirror Commit - r32299:7040f513

# encoding.py - character transcoding support for Mercurial
#
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import array

10

import array

11

import locale

11

import locale

12

import os

12

import os

13

import unicodedata

13

import unicodedata

14

15

from . import (

15

from . import (

16

error,

16

error,

17

pycompat,

17

pycompat,

18

)

18

)

19

20

# local alias for the native-str converter provided by pycompat
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr(); chr() is used in its place
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean() only scans for the
# ignored sequences when it sees a \xe2 or \xef byte, so every encoded
# sequence must start with one of those two bytes. Indexing a byte string
# yields an int on Python 3 and a one-character str on Python 2, hence the
# two spellings of the same check.
if pycompat.ispy3:
    assert set(i[0] for i in _ignore) == {ord(b'\xe2'), ord(b'\xef')}
else:
    assert set(i[0] for i in _ignore) == {"\xe2", "\xef"}

def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    Returns s with every sequence listed in the module-level _ignore
    table removed. s is returned unchanged when it contains neither of
    the two lead bytes checked below.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: only walk the _ignore table when one of the two
    # possible lead bytes is present at all
    if "\xe2" in s or "\xef" in s:
        for c in _ignore:
            s = s.replace(c, '')
    return s

# encoding.environ is provided read-only, which may not be used to modify
# the process environment
#
# _nativeenviron is true when the interpreter itself can hand us a bytes
# environment: always on Python 2, and on Python 3 only when
# os.supports_bytes_environ is set.
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items())  # re-exports

# maps encoding names reported by the locale module to the name that
# should be used instead; each value is a callable returning the
# replacement name
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
}

# Pick the local encoding: HGENCODING wins when set and non-empty,
# otherwise the locale's preferred encoding is used (after applying
# _encodingfixers), and 'ascii' is the fallback when the locale module
# raises locale.Error.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
except locale.Error:
    encoding = 'ascii'
# error-handling mode passed to decode() by fromlocal(), lower() and
# upperfallback()
encodingmode = environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() when stored data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

class localstr(str):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The string value itself is the local representation; the original
    UTF-8 form is kept in the _utf8 attribute.'''
    def __new__(cls, u, l):
        # u: the UTF-8 form to remember, l: the local form that becomes
        # the value of the string
        s = str.__new__(cls, l)
        s._utf8 = u
        return s
    def __hash__(self):
        # hash on the UTF-8 form rather than on the local form
        return hash(self._utf8)  # avoid collisions in local string space

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    Raises error.Abort if one of the configured encodings is unknown
    to the codec machinery (LookupError).

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), u"replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return r
            # lossy: keep the UTF-8 original alongside the local form
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), u"replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return r
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises error.Abort when s cannot be decoded, or when the configured
    encoding is unknown (LookupError).
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show up to 10 bytes on either side of the offending position
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding

    The string is encoded to UTF-8 and then handed to tolocal().
    """
    return tolocal(u.encode('utf-8'))

def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string

    The string is converted to UTF-8 by fromlocal() and then decoded.
    """
    return fromlocal(s).decode('utf-8')

# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    # native str is unicode on Python 3, so a real conversion is needed
    strtolocal = unitolocal
    strfromlocal = unifromlocal
else:
    # on Python 2 the value is passed through pycompat.identity
    strtolocal = pycompat.identity
    strfromlocal = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                   for k, v in os.environ.items())  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
#
# The value is a string of east_asian_width() category letters; ucolwidth()
# counts a character as two columns when its category is in this string.
wide = ("WFA" if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
        else "WF")

def colwidth(s):
    """Find the column width of a string for display in the local encoding

    s is decoded with the local encoding (undecodable bytes are replaced)
    and the result is measured with ucolwidth().
    """
    return ucolwidth(s.decode(_sysstr(encoding), u'replace'))

def ucolwidth(d):
    """Find the column width of a Unicode string for display

    A character whose east_asian_width() category appears in the
    module-level 'wide' string counts as two columns, any other character
    as one. When unicodedata has no east_asian_width, len(d) is returned.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in wide else 1 for c in d)

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice s[start:x] whose column width is exactly c.
    Falls off the end (returning None) when no such slice is found.'''
    # a c-column substring needs at least c bytes, so start there
    # NOTE(review): the range stops before len(s), so the slice reaching
    # the very end of s is never tried -- confirm whether that is intended
    for x in xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    When 's' cannot be decoded with the local encoding, it is trimmed
    byte-wise instead of column-wise.

    >>> ellipsis = '+++'
    >>> from . import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to counting bytes instead of columns
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[:width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[:width + len(ellipsis)]

    # uslice(i) drops i characters from the trimmed side; concat() puts the
    # ellipsis on that same side
    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop one more character at a time until the remainder fits
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters

def _asciilower(s):

311

def _asciilower(s):

315

'''convert a string to lowercase if ASCII

312

'''convert a string to lowercase if ASCII

316

313

317

Raises UnicodeDecodeError if non-ASCII characters are found.'''

314

Raises UnicodeDecodeError if non-ASCII characters are found.'''

318

s.decode('ascii')

315

s.decode('ascii')

319

return s.lower()

316

return s.lower()

320

317

321

def asciilower(s):
    '''convert a string to lowercase if ASCII

    Uses parsers.asciilower when the parsers module provides it and
    _asciilower otherwise. The first call rebinds the module-level name
    asciilower to the chosen implementation, so later calls go to it
    directly.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciilower', _asciilower)
    global asciilower
    asciilower = impl
    return impl(s)

def _asciiupper(s):

327

def _asciiupper(s):

331

'''convert a string to uppercase if ASCII

328

'''convert a string to uppercase if ASCII

332

329

333

Raises UnicodeDecodeError if non-ASCII characters are found.'''

330

Raises UnicodeDecodeError if non-ASCII characters are found.'''

334

s.decode('ascii')

331

s.decode('ascii')

335

return s.upper()

332

return s.upper()

336

333

337

def asciiupper(s):
    '''convert a string to uppercase if ASCII

    Uses parsers.asciiupper when the parsers module provides it and
    _asciiupper otherwise. The first call rebinds the module-level name
    asciiupper to the chosen implementation, so later calls go to it
    directly.'''
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    from . import parsers
    impl = getattr(parsers, 'asciiupper', _asciiupper)
    global asciiupper
    asciiupper = impl
    return impl(s)

def lower(s):
    """best-effort encoding-aware case-folding of local string s

    Pure-ASCII input is handled by asciilower(). Otherwise s is decoded
    (from its cached UTF-8 form when it is a localstr), lowercased and
    re-encoded in the local encoding. Raises error.Abort when the
    configured encoding is unknown (LookupError).
    """
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII; fall through to the encoding-aware path
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

def upper(s):
    """best-effort encoding-aware case-folding of local string s

    Pure-ASCII input is handled by asciiupper(); anything else is passed
    to upperfallback().
    """
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)

def upperfallback(s):
    """encoding-aware uppercasing of local string s

    s is decoded (from its cached UTF-8 form when it is a localstr),
    uppercased and re-encoded in the local encoding. Raises error.Abort
    when the configured encoding is unknown (LookupError).
    """
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    lower = -1
    upper = 1
    other = 0

# Lookup tables for jsonescape(), indexed by byte value.
_jsonmap = []
# 0x00-0x1f: generic \uXXXX escapes (some are overridden below)
_jsonmap.extend("\\u%04x" % x for x in range(32))
# 0x20-0x7e: passed through unchanged (some are overridden below)
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
_jsonmap.append('\\u007f')
# short escape forms
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
_jsonmap[0x5c] = '\\\\'
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
# The paranoid table is copied while only 128 entries exist, so indexing it
# with a byte >= 0x80 raises IndexError; jsonescape() relies on that to
# leave its fast path.
_paranoidjsonmap = _jsonmap[:]
_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
# 0x80-0xff: passed through unchanged in the non-paranoid table only
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))

def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape('<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    if paranoid:
        jm = _paranoidjsonmap
    else:
        jm = _jsonmap

    u8chars = toutf8b(s)
    try:
        # byte-by-byte table lookup; the paranoid table only has 128
        # entries, so any non-ASCII byte raises IndexError here
        return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
    except IndexError:
        pass
    # non-BMP char is represented as UTF-16 surrogate pair
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
    u16codes.pop(0)  # drop BOM
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

474

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

478

475

479

def getutf8char(s, pos):

476

def getutf8char(s, pos):

480

'''get the next full utf-8 character in the given string, starting at pos

477

'''get the next full utf-8 character in the given string, starting at pos

481

478

482

Raises a UnicodeError if the given location does not start a valid

479

Raises a UnicodeError if the given location does not start a valid

483

utf-8 character.

480

utf-8 character.

484

'''

481

'''

485

482

486

# find how many bytes to attempt decoding from first nibble

483

# find how many bytes to attempt decoding from first nibble

487

l = _utf8len[ord(s[pos]) >> 4]

484

l = _utf8len[ord(s[pos]) >> 4]

488

if not l: # ascii

485

if not l: # ascii

489

return s[pos]

486

return s[pos]

490

487

491

c = s[pos:pos + l]

488

c = s[pos:pos + l]

492

# validate with attempted decode

489

# validate with attempted decode

493

c.decode("utf-8")

490

c.decode("utf-8")

494

return c

491

return c

495

492

496

def toutf8b(s):

493

def toutf8b(s):

497

'''convert a local, possibly-binary string into UTF-8b

494

'''convert a local, possibly-binary string into UTF-8b

498

495

499

This is intended as a generic method to preserve data when working

496

This is intended as a generic method to preserve data when working

500

with schemes like JSON and XML that have no provision for

497

with schemes like JSON and XML that have no provision for

501

arbitrary byte strings. As Mercurial often doesn't know

498

arbitrary byte strings. As Mercurial often doesn't know

502

what encoding data is in, we use so-called UTF-8b.

499

what encoding data is in, we use so-called UTF-8b.

503

500

504

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

501

If a string is already valid UTF-8 (or ASCII), it passes unmodified.

505

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

502

Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,

506

uDC00-uDCFF.

503

uDC00-uDCFF.

507

504

508

Principles of operation:

505

Principles of operation:

509

506

510

- ASCII and UTF-8 data successfully round-trips and is understood

507

- ASCII and UTF-8 data successfully round-trips and is understood

511

by Unicode-oriented clients

508

by Unicode-oriented clients

512

- filenames and file contents in arbitrary other encodings can have

509

- filenames and file contents in arbitrary other encodings can have

513

be round-tripped or recovered by clueful clients

510

be round-tripped or recovered by clueful clients

514

- local strings that have a cached known UTF-8 encoding (aka

511

- local strings that have a cached known UTF-8 encoding (aka

515

localstr) get sent as UTF-8 so Unicode-oriented clients get the

512

localstr) get sent as UTF-8 so Unicode-oriented clients get the

516

Unicode data they want

513

Unicode data they want

517

- because we must preserve UTF-8 bytestring in places such as

514

- because we must preserve UTF-8 bytestring in places such as

518

filenames, metadata can't be roundtripped without help

515

filenames, metadata can't be roundtripped without help

519

516

520

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

517

(Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and

521

arbitrary bytes into an internal Unicode format that can be

518

arbitrary bytes into an internal Unicode format that can be

522

re-encoded back into the original. Here we are exposing the

519

re-encoded back into the original. Here we are exposing the

523

internal surrogate encoding as a UTF-8 string.)

520

internal surrogate encoding as a UTF-8 string.)

524

'''

521

'''

525

522

526

if "\xed" not in s:

523

if "\xed" not in s:

527

if isinstance(s, localstr):

524

if isinstance(s, localstr):

528

return s._utf8

525

return s._utf8

529

try:

526

try:

530

s.decode('utf-8')

527

s.decode('utf-8')

531

return s

528

return s

532

except UnicodeDecodeError:

529

except UnicodeDecodeError:

533

pass

530

pass

534

531

535

r = ""

532

r = ""

536

pos = 0

533

pos = 0

537

l = len(s)

534

l = len(s)

538

while pos < l:

535

while pos < l:

539

try:

536

try:

540

c = getutf8char(s, pos)

537

c = getutf8char(s, pos)

541

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

538

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

542

# have to re-escape existing U+DCxx characters

539

# have to re-escape existing U+DCxx characters

543

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

540

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

544

pos += 1

541

pos += 1

545

else:

542

else:

546

pos += len(c)

543

pos += len(c)

547

except UnicodeDecodeError:

544

except UnicodeDecodeError:

548

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

545

c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')

549

pos += 1

546

pos += 1

550

r += c

547

r += c

551

return r

548

return r

552

549

553

def fromutf8b(s):

550

def fromutf8b(s):

554

'''Given a UTF-8b string, return a local, possibly-binary string.

551

'''Given a UTF-8b string, return a local, possibly-binary string.

555

552

556

return the original binary string. This

553

return the original binary string. This

557

is a round-trip process for strings like filenames, but metadata

554

is a round-trip process for strings like filenames, but metadata

558

that's was passed through tolocal will remain in UTF-8.

555

that's was passed through tolocal will remain in UTF-8.

559

556

560

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

557

>>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x

561

>>> m = "\\xc3\\xa9\\x99abcd"

558

>>> m = "\\xc3\\xa9\\x99abcd"

562

>>> toutf8b(m)

559

>>> toutf8b(m)

563

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

560

'\\xc3\\xa9\\xed\\xb2\\x99abcd'

564

>>> roundtrip(m)

561

>>> roundtrip(m)

565

True

562

True

566

>>> roundtrip("\\xc2\\xc2\\x80")

563

>>> roundtrip("\\xc2\\xc2\\x80")

567

True

564

True

568

>>> roundtrip("\\xef\\xbf\\xbd")

565

>>> roundtrip("\\xef\\xbf\\xbd")

569

True

566

True

570

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

567

>>> roundtrip("\\xef\\xef\\xbf\\xbd")

571

True

568

True

572

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

569

>>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")

573

True

570

True

574

'''

571

'''

575

572

576

# fast path - look for uDxxx prefixes in s

573

# fast path - look for uDxxx prefixes in s

577

if "\xed" not in s:

574

if "\xed" not in s:

578

return s

575

return s

579

576

580

# We could do this with the unicode type but some Python builds

577

# We could do this with the unicode type but some Python builds

581

# use UTF-16 internally (issue5031) which causes non-BMP code

578

# use UTF-16 internally (issue5031) which causes non-BMP code

582

# points to be escaped. Instead, we use our handy getutf8char

579

# points to be escaped. Instead, we use our handy getutf8char

583

# helper again to walk the string without "decoding" it.

580

# helper again to walk the string without "decoding" it.

584

581

585

r = ""

582

r = ""

586

pos = 0

583

pos = 0

587

l = len(s)

584

l = len(s)

588

while pos < l:

585

while pos < l:

589

c = getutf8char(s, pos)

586

c = getutf8char(s, pos)

590

pos += len(c)

587

pos += len(c)

591

# unescape U+DCxx characters

588

# unescape U+DCxx characters

592

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

589

if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":

593

c = chr(ord(c.decode("utf-8")) & 0xff)

590

c = chr(ord(c.decode("utf-8")) & 0xff)

594

r += c

591

r += c

595

return r

592

return r

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import array
             import locale
             import os
             import unicodedata
             from . import (
                 error,
                 pycompat,
             )
             _sysstr = pycompat.sysstr
             if pycompat.ispy3:
                 unichr = chr
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work
-            if pycompat.ispy3:
+            assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
-                assert set(i[0] for i in _ignore) == {ord(b'\xe2'), ord(b'\xef')}
-            else:
-                assert set(i[0] for i in _ignore) == {"\xe2", "\xef"}
             def hfsignoreclean(s):
                 """Strip from s every codepoint that HFS+ ignores.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # every entry of _ignore starts with \xe2 or \xef, so a string holding
                 # neither byte cannot contain one and is returned untouched
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 cleaned = s
                 for ignored in _ignore:
                     cleaned = cleaned.replace(ignored, '')
                 return cleaned
             # encoding.environ is provided read-only, which may not be used to modify
             # the process environment
             #
             # _nativeenviron is true when a ready-made environment mapping is used
             # as-is (os.environ when not on Python 3, os.environb when
             # os.supports_bytes_environ is set); otherwise a copy is built by hand.
             _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
             if not pycompat.ispy3:
                 environ = os.environ  # re-exports
             elif _nativeenviron:
                 environ = os.environb  # re-exports
             else:
                 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
                 # and recreate it once encoding is settled
                 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                                for k, v in os.environ.items())  # re-exports
             # Encoding names that are rewritten to 'ascii' when the locale reports
             # them.  The values are callables so the lookup below can use a single
             # call form together with its "leave unchanged" default.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
             }
             # Choose the local encoding: a non-empty HGENCODING wins; otherwise the
             # locale's preferred encoding is used (after the alias fix-up above), and
             # 'ascii' is the fallback when the locale module raises locale.Error.
             try:
                 encoding = environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handler name handed to decode() by fromlocal/lower/upperfallback
             encodingmode = environ.get("HGENCODINGMODE", "strict")
             # tried by tolocal() when a stored string is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers the UTF-8 it was made from

                 The object behaves as the local string 'l'; the UTF-8 original 'u' is
                 kept in the _utf8 attribute so that a string which has not been
                 modified can be converted back without loss.'''

                 def __new__(cls, u, l):
                     self = super(localstr, cls).__new__(cls, l)
                     self._utf8 = u
                     return self

                 def __hash__(self):
                     # hash the UTF-8 form rather than the local one: this avoids
                     # collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 The result is a plain byte string when the conversion lost nothing
                 (or could not be made round-trippable at all), and a localstr when
                 the local form is lossy but the UTF-8 original is known.

                 Raises error.Abort when a codec lookup fails (LookupError).

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         # unencodable characters are substituted, not rejected
                         r = u.encode(_sysstr(encoding), u"replace")
                         if u == r.decode(_sysstr(encoding)):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original alongside the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(_sysstr(fallbackencoding))
                             r = u.encode(_sysstr(encoding), u"replace")
                             if u == r.decode(_sysstr(encoding)):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the re-encoded UTF-8, not the legacy bytes
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             # can't round-trip
                             return u.encode(_sysstr(encoding), u"replace")
                 except LookupError as k:
                     # unknown codec name
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr is answered from its cached UTF-8 form. Anything else is
                 decoded with the error mode taken from HGENCODINGMODE (default
                 'strict', under which undecodable input aborts; 'replace' substitutes
                 a special Unicode character and 'ignore' drops the offending
                 character) and then re-encoded as UTF-8.

                 Raises error.Abort on a decoding failure or an unknown codec.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip: the original UTF-8 was kept
                     return s._utf8
                 try:
                     decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to 10 bytes either side of the failure position
                     begin = max(0, err.start - 10)
                     context = s[begin:err.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def unitolocal(u):
                 """Encode unicode string u as UTF-8, then convert it to local encoding"""
                 utf8 = u.encode('utf-8')
                 return tolocal(utf8)
             def unifromlocal(s):
                 """Turn local-encoding byte string s into a unicode string (via UTF-8)"""
                 utf8 = fromlocal(s)
                 return utf8.decode('utf-8')
             # converter functions between native str and byte string. use these if the
             # character encoding is not aware (e.g. exception message) or is known to
             # be locale dependent (e.g. date formatting.)
             #
             # On Python 3 they are the unicode<->local converters; otherwise they are
             # pycompat.identity.
             if pycompat.ispy3:
                 strtolocal = unitolocal
                 strfromlocal = unifromlocal
             else:
                 strtolocal = pycompat.identity
                 strfromlocal = pycompat.identity
             if not _nativeenviron:
                 # now encoding and helper functions are available, recreate the environ
                 # dict to be exported to other modules
                 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8')))
                                for k, v in os.environ.items())  # re-exports
             # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
             #
             # 'wide' holds the east_asian_width codes that ucolwidth() counts as two
             # columns: "W" and "F" always, plus "A" when HGENCODINGAMBIGUOUS is "wide".
             wide = (environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
                     and "WFA" or "WF")
             def colwidth(s):
                 "Return the number of display columns taken by local-encoding string s"
                 # undecodable bytes turn into replacement characters, not errors
                 text = s.decode(_sysstr(encoding), u'replace')
                 return ucolwidth(text)
             def ucolwidth(d):
                 "Return the number of display columns taken by Unicode string d"
                 widthclass = getattr(unicodedata, 'east_asian_width', None)
                 if widthclass is None:
                     # no width data available: count one column per character
                     return len(d)
                 # characters whose width class is listed in 'wide' take two columns
                 return sum(2 if widthclass(ch) in wide else 1 for ch in d)
             def getcols(s, start, c):
                 '''Return the substring of s beginning at byte index start that is
                 exactly c columns wide according to colwidth

                 Candidate end offsets run from start + c up to, but not including,
                 len(s); None is returned when none of them yields c columns.'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                     end += 1
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 A string that cannot be decoded in the local encoding is trimmed
                 byte by byte, one column per byte.

                 >>> ellipsis = '+++'
                 >>> from . import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(_sysstr(encoding))
                 except UnicodeDecodeError:
                     # undecodable input: fall back to plain byte slicing
                     if len(s) <= width:
                         return s  # already fits
                     room = width - len(ellipsis)
                     if room <= 0:
                         # not even the ellipsis fits; cut it down to 'width'
                         return ellipsis[:width]
                     if leftside:
                         return ellipsis + s[-room:]
                     return s[:room] + ellipsis

                 if ucolwidth(u) <= width:
                     return s  # already fits
                 room = width - len(ellipsis)
                 if room <= 0:
                     # not even the ellipsis fits; cut it down to 'width'
                     return ellipsis[:width]

                 # drop one more character from the trimmed side each round until
                 # what is left fits beside the ellipsis
                 for drop in xrange(1, len(u)):
                     if leftside:
                         kept = u[drop:]
                     else:
                         kept = u[:-drop]
                     if ucolwidth(kept) <= room:
                         encoded = kept.encode(_sysstr(encoding))
                         if leftside:
                             return ellipsis + encoded
                         return encoded + ellipsis
                 # a single multi-column character is still too wide
                 return ellipsis
             def _asciilower(s):
                 '''convert a string to lowercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.lower()
             def asciilower(s):
                 '''Lowercase ASCII string s, choosing the implementation on first call

                 The 'asciilower' attribute of the parsers module is used when it
                 exists, _asciilower otherwise.'''
                 global asciilower
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciilower', _asciilower)
                 # rebind the module-level name so that later calls go straight to
                 # the chosen implementation
                 asciilower = chosen
                 return chosen(s)
             def _asciiupper(s):
                 '''convert a string to uppercase if ASCII
                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 s.decode('ascii')
                 return s.upper()
             def asciiupper(s):
                 '''Uppercase ASCII string s, choosing the implementation on first call

                 The 'asciiupper' attribute of the parsers module is used when it
                 exists, _asciiupper otherwise.'''
                 global asciiupper
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 from . import parsers
                 chosen = getattr(parsers, 'asciiupper', _asciiupper)
                 # rebind the module-level name so that later calls go straight to
                 # the chosen implementation
                 asciiupper = chosen
                 return chosen(s)
             def lower(s):
                 """best-effort, encoding-aware lowercasing of local string s

                 Pure ASCII input is handled by asciilower. Otherwise s is decoded
                 (a localstr from its cached UTF-8), lowercased as Unicode and encoded
                 back to the local encoding; s itself is returned when lowercasing
                 changes nothing. Raises error.Abort on an unknown codec.
                 """
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     pass  # not pure ASCII, take the Unicode route below
                 try:
                     if not isinstance(s, localstr):
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     else:
                         text = s._utf8.decode("utf-8")
                     folded = text.lower()
                     if folded != text:
                         return folded.encode(_sysstr(encoding))
                     return s  # unchanged: hand back s so a localstr survives
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.lower()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 """best-effort, encoding-aware uppercasing of local string s

                 asciiupper is tried first; upperfallback takes over when it raises
                 UnicodeDecodeError.
                 """
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 """Uppercase local string s by way of Unicode

                 s is decoded (a localstr from its cached UTF-8), uppercased and
                 encoded back to the local encoding; s itself is returned when
                 uppercasing changes nothing. Raises error.Abort on an unknown codec.
                 """
                 try:
                     if not isinstance(s, localstr):
                         text = s.decode(_sysstr(encoding), _sysstr(encodingmode))
                     else:
                         text = s._utf8.decode("utf-8")
                     folded = text.upper()
                     if folded != text:
                         return folded.encode(_sysstr(encoding))
                     return s  # unchanged: hand back s so a localstr survives
                 except UnicodeError:
                     # we don't know how to fold this except in ASCII
                     return s.upper()
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''Constants describing what a platform's normcase does to ASCII strings

                 The value is chosen per platform and should match what normcase on
                 that platform really does:

                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings
                 other: the fallback function should always be called

                 This should be kept in sync with normcase_spec in util.h.'''

                 other = 0
                 upper = 1
                 lower = -1
             # Escape tables for jsonescape(), indexed by byte value; each entry is the
             # text emitted for that byte.
             _jsonmap = []
             # 0x00-0x1f: control characters, written as \u00XX by default
             _jsonmap.extend("\\u%04x" % x for x in range(32))
             # 0x20-0x7e: emitted as themselves
             _jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
             # 0x7f (DEL)
             _jsonmap.append('\\u007f')
             # overrides using JSON's short escapes, plus the quote and the backslash
             _jsonmap[0x09] = '\\t'
             _jsonmap[0x0a] = '\\n'
             _jsonmap[0x22] = '\\"'
             _jsonmap[0x5c] = '\\\\'
             _jsonmap[0x08] = '\\b'
             _jsonmap[0x0c] = '\\f'
             _jsonmap[0x0d] = '\\r'
             # The paranoid table is copied while only the 128 ASCII entries exist, so
             # indexing it with a byte >= 0x80 raises IndexError; jsonescape() catches
             # that to leave its fast path.
             _paranoidjsonmap = _jsonmap[:]
             _paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
             _paranoidjsonmap[0x3e] = '\\u003e'  # '>'
             # the regular table passes bytes 0x80-0xff through unchanged
             _jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
             def jsonescape(s, paranoid=False):
                 '''returns a string suitable for JSON

                 JSON cannot carry bytes that are not Unicode text, so the input is
                 prepared as follows:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
                 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
                 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''

                 If paranoid, non-ascii and common troublesome characters are also escaped.
                 This is suitable for web output.

                 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
                 'escape boundary: ~ \\\\u007f \\\\u0080'
                 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
                 'a weird byte: \\\\udcdd'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
                 'utf-8: caf\\\\u00e9'
                 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
                 'non-BMP: \\\\ud834\\\\udd1e'
                 >>> jsonescape('<foo@example.org>', paranoid=True)
                 '\\\\u003cfoo@example.org\\\\u003e'
                 '''
                 table = _paranoidjsonmap if paranoid else _jsonmap
                 utf8b = toutf8b(s)
                 try:
                     # fast path: one table entry per byte; a byte the table has no
                     # entry for raises IndexError and sends us to the slow path
                     return ''.join(table[byte] for byte in bytearray(utf8b))
                 except IndexError:
                     pass
                 # slow path: escape per UTF-16 code unit, so that a non-BMP char is
                 # represented as a surrogate pair
                 units = array.array('H', utf8b.decode('utf-8').encode('utf-16'))
                 del units[0]  # drop BOM
                 return ''.join(table[u] if u < 128 else '\\u%04x' % u for u in units)
             _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
             def getutf8char(s, pos):
                 '''get the next full utf-8 character in the given string, starting at pos
                 Raises a UnicodeError if the given location does not start a valid
                 utf-8 character.
                 '''
                 # find how many bytes to attempt decoding from first nibble
                 l = _utf8len[ord(s[pos]) >> 4]
                 if not l: # ascii
                     return s[pos]
                 c = s[pos:pos + l]
                 # validate with attempted decode
                 c.decode("utf-8")
                 return c
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     # The cached UTF-8 is the real data.  This has to be decided
                     # before looking at the bytes of s: they are the lossy local
                     # form and may contain "\xed" in the local encoding, which
                     # used to send a localstr down the byte-escaping path below.
                     return s._utf8
                 if "\xed" not in s:
                     # no U+Dxxx lead byte: valid UTF-8 can be passed through as-is
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 pieces = []
                 pos = 0
                 l = len(s)
                 while pos < l:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # have to re-escape existing U+DCxx characters
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not the start of a valid character: escape this one byte
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     pieces.append(c)
                 return "".join(pieces)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 This undoes the escaping done by toutf8b and returns the original
                 binary string. It is a round-trip process for strings like
                 filenames, but metadata that was passed through tolocal will
                 remain in UTF-8.

                 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> toutf8b(m)
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> roundtrip(m)
                 True
                 >>> roundtrip("\\xc2\\xc2\\x80")
                 True
                 >>> roundtrip("\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xef\\xef\\xbf\\xbd")
                 True
                 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
                 True
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if b"\xed" not in s:
                     return s

                 # We could do this with the unicode type but some Python builds
                 # use UTF-16 internally (issue5031) which causes non-BMP code
                 # points to be escaped. Instead, we use our handy getutf8char
                 # helper again to walk the string without "decoding" it.
                 parts = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     # unescape U+DCxx characters
                     if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                         # The original byte is the low 8 bits of the code point:
                         # the low 2 payload bits of the second UTF-8 byte followed
                         # by the 6 payload bits of the third. Computing it from the
                         # bytes avoids decoding a lone surrogate, which strict
                         # UTF-8 codecs may reject.
                         low = ((ord(c[1:2]) & 0x03) << 6) | (ord(c[2:3]) & 0x3f)
                         c = bytes(bytearray((low,)))
                     parts.append(c)
                 return b"".join(parts)