upstream/mercurial-mirror Commit - r22779:d9585dda

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error, parsers

8

import error, parsers

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():
    '''
    Return the codeset of the user's LC_CTYPE locale.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the one actually
    expected.

    The process-wide LC_CTYPE setting is switched to the user's default
    while the codeset is queried and is always put back afterwards, even
    if the query raises.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # never leave the process in the temporary locale, whatever
        # happened above
        locale.setlocale(locale.LC_CTYPE, oldloc)

34

35

# Encoding names reported by the platform that should not be taken at
# face value, mapped to a callable returning the name to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# HGENCODING wins when set; otherwise ask the locale machinery and run
# the answer through the fixers above. Any locale failure means ascii.
try:
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'

# error handling mode handed to decode() when converting local strings
encodingmode = os.environ.get("HGENCODINGMODE", "strict")

# encoding tried for stored data that is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):
    '''A string in the local encoding that also carries the UTF-8 bytes
    it was produced from.

    The instance behaves as the local string 'l'; the UTF-8 original 'u'
    is kept in the _utf8 attribute so that an unmodified string can be
    converted back without loss.'''
    def __new__(cls, u, l):
        self = super(localstr, cls).__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash on the UTF-8 form: distinct originals may share the same
        # lossy local representation and must not collide on it
        return hash(self._utf8)

60

61

def tolocal(s):

61

def tolocal(s):

62

"""

62

"""

63

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

64

65

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

66

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

67

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

69

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

76

>>> l = tolocal(u)

77

>>> l

77

>>> l

78

'foo: ?'

78

'foo: ?'

79

>>> fromlocal(l)

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> len(d) # no collision

83

>>> len(d) # no collision

84

2

84

2

85

>>> 'foo: ?' in d

85

>>> 'foo: ?' in d

86

False

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

88

>>> l = tolocal(l1)

89

>>> l

89

>>> l

90

'foo: ?'

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

92

'foo: \\xc3\\xa4'

93

"""

93

"""

94

95

try:

95

try:

96

try:

96

try:

97

# make sure string is actually stored in UTF-8

97

# make sure string is actually stored in UTF-8

98

u = s.decode('UTF-8')

98

u = s.decode('UTF-8')

99

if encoding == 'UTF-8':

99

if encoding == 'UTF-8':

100

# fast path

100

# fast path

101

return s

101

return s

102

r = u.encode(encoding, "replace")

102

r = u.encode(encoding, "replace")

103

if u == r.decode(encoding):

103

if u == r.decode(encoding):

104

# r is a safe, non-lossy encoding of s

104

# r is a safe, non-lossy encoding of s

105

return r

105

return r

106

return localstr(s, r)

106

return localstr(s, r)

107

except UnicodeDecodeError:

107

except UnicodeDecodeError:

108

# we should only get here if we're looking at an ancient changeset

108

# we should only get here if we're looking at an ancient changeset

109

try:

109

try:

110

u = s.decode(fallbackencoding)

110

u = s.decode(fallbackencoding)

111

r = u.encode(encoding, "replace")

111

r = u.encode(encoding, "replace")

112

if u == r.decode(encoding):

112

if u == r.decode(encoding):

113

# r is a safe, non-lossy encoding of s

113

# r is a safe, non-lossy encoding of s

114

return r

114

return r

115

return localstr(u.encode('UTF-8'), r)

115

return localstr(u.encode('UTF-8'), r)

116

except UnicodeDecodeError:

116

except UnicodeDecodeError:

117

u = s.decode("utf-8", "replace") # last ditch

117

u = s.decode("utf-8", "replace") # last ditch

118

return u.encode(encoding, "replace") # can't round-trip

118

return u.encode(encoding, "replace") # can't round-trip

119

except LookupError, k:

119

except LookupError, k:

120

raise error.Abort(k, hint="please check your locale settings")

120

raise error.Abort(k, hint="please check your locale settings")

121

122

def fromlocal(s):

122

def fromlocal(s):

123

"""

123

"""

124

Convert a string from the local character encoding to UTF-8

124

Convert a string from the local character encoding to UTF-8

125

126

We attempt to decode strings using the encoding mode set by

126

We attempt to decode strings using the encoding mode set by

127

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

127

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

128

characters will cause an error message. Other modes include

128

characters will cause an error message. Other modes include

129

'replace', which replaces unknown characters with a special

129

'replace', which replaces unknown characters with a special

130

Unicode character, and 'ignore', which drops the character.

130

Unicode character, and 'ignore', which drops the character.

131

"""

131

"""

132

133

# can we do a lossless round-trip?

133

# can we do a lossless round-trip?

134

if isinstance(s, localstr):

134

if isinstance(s, localstr):

135

return s._utf8

135

return s._utf8

136

137

try:

137

try:

138

return s.decode(encoding, encodingmode).encode("utf-8")

138

return s.decode(encoding, encodingmode).encode("utf-8")

139

except UnicodeDecodeError, inst:

139

except UnicodeDecodeError, inst:

140

sub = s[max(0, inst.start - 10):inst.start + 10]

140

sub = s[max(0, inst.start - 10):inst.start + 10]

141

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

141

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

142

except LookupError, k:

142

except LookupError, k:

143

raise error.Abort(k, hint="please check your locale settings")

143

raise error.Abort(k, hint="please check your locale settings")

144

145

# East Asian width classes that occupy two columns. Ambiguous-width
# characters ('A') are included only when HGENCODINGAMBIGUOUS is set
# to 'wide'.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"

148

149

def colwidth(s):
    """Return the number of display columns taken by local string s

    Bytes that cannot be decoded in the local encoding are replaced
    before measuring."""
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

152

153

def ucolwidth(d):
    """Return the number of display columns taken by Unicode string d

    Characters whose East Asian width class is listed in 'wide' count as
    two columns, all others as one. Without
    unicodedata.east_asian_width the plain length is returned."""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    columns = 0
    for ch in d:
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns

159

160

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate substrings s[start:x] are tried with increasing end index
    x, beginning at start + c and going up to and including len(s), so
    a substring that reaches the end of s is found as well. The first
    candidate that is exactly c columns wide is returned; if there is
    none, the result is None.'''
    # len(s) + 1: the end index len(s) itself must be tried too
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t

167

168

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    The right end of 's' is cut off unless 'leftside' is True, in which
    case the left end is. 'ellipsis' is put on the side that was cut.
    If 's' cannot be decoded in the local encoding, it is trimmed byte
    by byte instead of column by column.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for text once the ellipsis has been accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable: fall back to counting bytes
        if len(s) <= width:
            # fits already
            return s
        if room <= 0:
            # not even the ellipsis fits; cut the ellipsis itself
            return ellipsis[:width]
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        # fits already
        return s

    if room <= 0:
        # not even the ellipsis fits; cut the ellipsis itself
        return ellipsis[:width]

    # drop one more character per round from the trimmed side until the
    # remainder fits into the available columns
    for i in xrange(1, len(u)):
        if leftside:
            usub = u[i:]
        else:
            usub = u[:-i]
        if ucolwidth(usub) <= room:
            kept = usub.encode(encoding)
            if leftside:
                return ellipsis + kept
            return kept + ellipsis
    # a single multi-column character is already too wide
    return ellipsis

260

261

def asciilower(s):
    '''Return s in lowercase, provided it is pure ASCII

    Raises UnicodeDecodeError if s contains a non-ASCII character.'''
    # decoded only to validate; the decoded value itself is not used
    s.decode('ascii')
    lowered = s.lower()
    return lowered

267

268

# use the asciilower provided by the parsers module when it has one;
# otherwise the Python implementation defined above stays in place
try:
    asciilower = parsers.asciilower
except AttributeError:
    pass

269

270

def lower(s):

270

def lower(s):

271

"best-effort encoding-aware case-folding of local string s"

271

"best-effort encoding-aware case-folding of local string s"

272

try:

272

try:

273

s.decode('ascii') # throw exception for non-ASCII character

273

return asciilower(s)

274

return s.lower()

275

except UnicodeDecodeError:

274

except UnicodeDecodeError:

276

pass

275

pass

277

try:

276

try:

278

if isinstance(s, localstr):

277

if isinstance(s, localstr):

279

u = s._utf8.decode("utf-8")

278

u = s._utf8.decode("utf-8")

280

else:

279

else:

281

u = s.decode(encoding, encodingmode)

280

u = s.decode(encoding, encodingmode)

282

281

283

lu = u.lower()

282

lu = u.lower()

284

if u == lu:

283

if u == lu:

285

return s # preserve localstring

284

return s # preserve localstring

286

return lu.encode(encoding)

285

return lu.encode(encoding)

287

except UnicodeError:

286

except UnicodeError:

288

return s.lower() # we don't know how to fold this except in ASCII

287

return s.lower() # we don't know how to fold this except in ASCII

289

except LookupError, k:

288

except LookupError, k:

290

raise error.Abort(k, hint="please check your locale settings")

289

raise error.Abort(k, hint="please check your locale settings")

291

290

292

def upper(s):

291

def upper(s):

293

"best-effort encoding-aware case-folding of local string s"

292

"best-effort encoding-aware case-folding of local string s"

294

try:

293

try:

295

s.decode('ascii') # throw exception for non-ASCII character

294

s.decode('ascii') # throw exception for non-ASCII character

296

return s.upper()

295

return s.upper()

297

except UnicodeDecodeError:

296

except UnicodeDecodeError:

298

pass

297

pass

299

try:

298

try:

300

if isinstance(s, localstr):

299

if isinstance(s, localstr):

301

u = s._utf8.decode("utf-8")

300

u = s._utf8.decode("utf-8")

302

else:

301

else:

303

u = s.decode(encoding, encodingmode)

302

u = s.decode(encoding, encodingmode)

304

303

305

uu = u.upper()

304

uu = u.upper()

306

if u == uu:

305

if u == uu:

307

return s # preserve localstring

306

return s # preserve localstring

308

return uu.encode(encoding)

307

return uu.encode(encoding)

309

except UnicodeError:

308

except UnicodeError:

310

return s.upper() # we don't know how to fold this except in ASCII

309

return s.upper() # we don't know how to fold this except in ASCII

311

except LookupError, k:

310

except LookupError, k:

312

raise error.Abort(k, hint="please check your locale settings")

311

raise error.Abort(k, hint="please check your locale settings")

313

312

314

# byte -> JSON-escaped replacement; filled in on the first call to
# jsonescape()
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    The first three steps are done by toutf8b(); the escaping is then
    applied to its result byte by byte.

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters get the generic \uXXXX form by default; the
        # backslash is written out explicitly rather than relying on
        # "\u" being left alone in a byte string literal
        for x in xrange(32):
            _jsonmap[chr(x)] = "\\u%04x" % x
        # everything else passes through unchanged by default
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # characters that have a short escape of their own
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))

356

355

357

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Walk s one UTF-8 sequence at a time. Each sequence is validated on
    # its own, so an invalid byte can never be confused with an equal
    # byte belonging to a neighbouring valid character.
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        lead = ord(s[pos])
        # length of the sequence this lead byte announces
        if lead < 0x80:
            size = 1
        elif lead < 0xc0:
            size = 0 # continuation byte, cannot start a character
        elif lead < 0xe0:
            size = 2
        elif lead < 0xf0:
            size = 3
        else:
            size = 4
        seq = s[pos:pos + size]
        if size and len(seq) == size:
            try:
                seq.decode('utf-8')
            except UnicodeDecodeError:
                size = 0
        else:
            # stray continuation byte or sequence cut off by end of s
            size = 0
        if size:
            out.append(seq)
            pos += size
        else:
            # surrogate-encode the offending byte and resume right after
            out.append(unichr(0xdc00 + lead).encode('utf-8'))
            pos += 1
    return ''.join(out)

405

404

406

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Characters in the range uDC00-uDCFF are turned back into the single
    byte they stand for; all other characters are kept as UTF-8. For
    strings like filenames this undoes toutf8b, but metadata that was
    passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path: without an \xed byte there is no uDxxx prefix in s
    if "\xed" not in s:
        return s

    pieces = []
    for ch in s.decode("utf-8"):
        code = ord(ch)
        if code & 0xff00 == 0xdc00:
            # escaped byte: the low eight bits are the original value
            pieces.append(chr(code & 0xff))
        else:
            pieces.append(ch.encode("utf-8"))
    return "".join(pieces)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error, parsers
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.
                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the actually expected.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 result = locale.nl_langinfo(locale.CODESET)
                 locale.setlocale(locale.LC_CTYPE, oldloc)
                 return result
             # Corrections for encoding names reported by the locale module: maps a
             # reported name to a callable returning the name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # Local encoding: the HGENCODING environment variable wins when it is
             # set and non-empty; otherwise ask the locale module, defaulting to
             # 'ascii' when it reports nothing or raises locale.Error.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     # run the fixer registered for this name, if any; the default
                     # lambda leaves the name unchanged
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # Error-handling mode passed to decode() when converting local strings
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # Encoding tolocal() tries for stored data that is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers the UTF-8 it came from.

                 The str value is the local representation; the original UTF-8 is
                 kept in the _utf8 attribute so that unmodified strings can be
                 round-tripped to the local encoding and back.'''
                 def __new__(cls, u, l):
                     # u: UTF-8 form to remember, l: local form used as the value
                     inst = super(localstr, cls).__new__(cls, l)
                     inst._utf8 = u
                     return inst
                 def __hash__(self):
                     # hash on the UTF-8 form to avoid collisions in local string
                     # space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns a plain string when the local form loses nothing, and a
                 localstr when characters had to be replaced. Raises error.Abort
                 when the configured encoding name is unknown (LookupError).

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path; the comparison is exact, so any other
                             # spelling of the name goes through the re-encode below
                             return s
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the original UTF-8 alongside the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(fallbackencoding)
                             r = u.encode(encoding, "replace")
                             if u == r.decode(encoding):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the UTF-8 re-encoding of the fallback decode
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             return u.encode(encoding, "replace") # can't round-trip
                 except LookupError, k:
                     # unknown codec name in encoding or fallbackencoding
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             # How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to
             # 'wide' to treat them as wide. The value lists the
             # unicodedata.east_asian_width() classes that ucolwidth() counts as
             # two columns.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 '''Return the number of display columns taken by local string s.

                 s is decoded with the local encoding; undecodable bytes are
                 replaced rather than raising.'''
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 '''Return the number of display columns taken by Unicode string d.

                 Characters whose east-asian-width class appears in the module-level
                 'wide' setting count as two columns, all others as one. Without
                 unicodedata.east_asian_width every character counts as one.'''
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     return len(d)
                 columns = 0
                 for ch in d:
                     if eaw(ch) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Use colwidth to find a c-column substring of s starting at byte
                 index start

                 Candidate substrings s[start:end] are tried with end growing from
                 start + c up to, but not including, len(s). The first one that is
                 exactly c columns wide is returned; None is returned if there is
                 no such substring.'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                     end += 1
                 return None
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 's' is returned unchanged when it already fits. When 'width' leaves
                 no room for any of 's', only (a prefix of) 'ellipsis' is returned.
                 A string that cannot be decoded in the local encoding is trimmed
                 byte by byte instead of column by column.

                 >>> ellipsis = '+++'
                 >>> from mercurial import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(encoding)
                 except UnicodeDecodeError:
                     # s is not valid in the local encoding: measure and cut it in
                     # bytes, one byte per column
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # no enough room even for ellipsis
                         # width + len(ellipsis) is the width originally requested
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 # from here on, width is the room left for characters of s
                 width -= len(ellipsis)
                 if width <= 0: # no enough room even for ellipsis
                     # width + len(ellipsis) is the width originally requested
                     return ellipsis[:width + len(ellipsis)]
                 if leftside:
                     # drop i characters from the left, ellipsis goes in front
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     # drop i characters from the right, ellipsis goes at the end
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character each round until the remainder fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(encoding))
                 return ellipsis # no enough room for multi-column characters
             def asciilower(s):
                 '''Return the lowercase form of s, which must be pure ASCII.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decoding is done only for its check; the result is discarded
                 s.decode('ascii')
                 lowered = s.lower()
                 return lowered
             # Rebind to the implementation exported by the parsers module when it
             # has one (presumably a faster compiled version -- confirm); otherwise
             # the Python definition of asciilower stays in use.
             asciilower = getattr(parsers, 'asciilower', asciilower)
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
-                    s.decode('ascii') # throw exception for non-ASCII character
+                    return asciilower(s)
-                    return s.lower()
                 except UnicodeDecodeError:
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     lu = u.lower()
                     if u == lu:
                         return s # preserve localstring
                     return lu.encode(encoding)
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     s.decode('ascii') # throw exception for non-ASCII character
                     return s.upper()
                 except UnicodeDecodeError:
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     uu = u.upper()
                     if u == uu:
                         return s # preserve localstring
                     return uu.encode(encoding)
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             # byte -> JSON-escaped text; filled in by the first call to jsonescape
             _jsonmap = {}
             def jsonescape(s):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''
                 '''
                 if not _jsonmap:
                     # control characters become \uXXXX, everything else maps to
                     # itself, then the characters with short escapes are overridden
                     for code in xrange(256):
                         ch = chr(code)
                         if code < 32:
                             _jsonmap[ch] = "\\u%04x" % code
                         else:
                             _jsonmap[ch] = ch
                     _jsonmap.update({
                         '\t': '\\t',
                         '\n': '\\n',
                         '"': '\\"',
                         '\\': '\\\\',
                         '\b': '\\b',
                         '\f': '\\f',
                         '\r': '\\r',
                     })
                 return ''.join([_jsonmap[c] for c in toutf8b(s)])
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)

                 A byte that cannot start a valid sequence is escaped on its own,
                 even when it is the same byte as the lead byte of the valid
                 character that follows it:

                 >>> toutf8b("\\xc3\\xc3\\xa9")
                 '\\xed\\xb3\\x83\\xc3\\xa9'
                 '''
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     s.decode('utf-8')
                     return s
                 except UnicodeDecodeError:
                     pass
                 # Walk s one candidate character at a time. Each candidate is
                 # validated by itself, so the position in s is always known and a
                 # bad byte can never be confused with a neighbouring good one.
                 #
                 # seqlen: bytes in the sequence announced by a lead byte, indexed
                 # by its high nibble; 0 (ASCII and stray continuation bytes) means
                 # the byte is examined on its own.
                 seqlen = [0] * 12 + [2, 2, 3, 4]
                 parts = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     n = seqlen[ord(s[pos]) >> 4] or 1
                     c = s[pos:pos + n]
                     try:
                         c.decode('utf-8')
                         pos += len(c)
                     except UnicodeDecodeError:
                         # surrogate-encode the offending byte and resume right
                         # after it
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     parts.append(c)
                 return ''.join(parts)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 Bytes that were escaped as U+DC00-U+DCFF are turned back into the
                 original bytes; everything else is returned unchanged. This
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 Raises UnicodeDecodeError if s contains the byte 0xed but cannot
                 be decoded as UTF-8.

                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> n = toutf8b(m)
                 >>> n
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> fromutf8b(n) == m
                 True

                 Characters outside the BMP are left alone:

                 >>> fromutf8b("\\xed\\xb2\\x99\\xf0\\x90\\x80\\x80")
                 '\\x99\\xf0\\x90\\x80\\x80'
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s
                 # Reject malformed input up front. Past this point every 0xed is
                 # the lead byte of a complete three-byte sequence.
                 s.decode("utf-8")
                 # Work on the bytes rather than on a unicode object: iterating a
                 # unicode object yields surrogate halves for non-BMP characters on
                 # UTF-16 builds, and those would be mistaken for escaped bytes.
                 parts = []
                 pos = 0
                 idx = s.find("\xed")
                 while idx != -1:
                     if "\xb0" <= s[idx + 1] <= "\xb3":
                         # U+DCxx: xx is the low two bits of the second byte
                         # followed by the low six bits of the third
                         byte = (((ord(s[idx + 1]) & 0x03) << 6)
                                 | (ord(s[idx + 2]) & 0x3f))
                         parts.append(s[pos:idx])
                         parts.append(chr(byte))
                         pos = idx + 3
                     idx = s.find("\xed", idx + 3)
                 parts.append(s[pos:])
                 return ''.join(parts)