upstream/mercurial-mirror Commit - r26875:cf47bdb2

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

# These unicode characters are ignored by HFS+ (Apple Technote 1150,

11

# These unicode characters are ignored by HFS+ (Apple Technote 1150,

12

# "Unicode Subtleties"), so we need to ignore them in some places for

12

# "Unicode Subtleties"), so we need to ignore them in some places for

13

# sanity.

13

# sanity.

14

_ignore = [unichr(int(x, 16)).encode("utf-8") for x in

14

_ignore = [unichr(int(x, 16)).encode("utf-8") for x in

15

"200c 200d 200e 200f 202a 202b 202c 202d 202e "

15

"200c 200d 200e 200f 202a 202b 202c 202d 202e "

16

"206a 206b 206c 206d 206e 206f feff".split()]

16

"206a 206b 206c 206d 206e 206f feff".split()]

17

# verify the next function will work

17

# verify the next function will work

18

assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])

18

assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])

19

20

def hfsignoreclean(s):
    """Strip from s every codepoint that HFS+ ignores.

    s is a UTF-8 encoded byte string; the cleaned string is returned.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: without one of these two bytes there is nothing in
    # s that the replace loop below could match
    if "\xe2" not in s and "\xef" not in s:
        return s
    for seq in _ignore:
        s = s.replace(seq, '')
    return s

32

33

def _getpreferredencoding():
    '''
    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some distributions
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
    encoding, as it is unlikely that this encoding is the one actually expected.

    Returns the codeset reported by nl_langinfo() for the LC_CTYPE locale
    taken from the environment, or the encoding half of
    locale.getdefaultlocale() when the platform has no locale.CODESET.
    The process-wide LC_CTYPE setting is restored before returning, even
    if the lookup raises.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # setlocale() changes global state; never leave it switched
        locale.setlocale(locale.LC_CTYPE, oldloc)

56

57

# Encoding names that must not be used as reported, mapped to a callable
# returning the name to use instead.
_encodingfixers = dict([
    ('mac-roman', _getpreferredencoding),
    ('ANSI_X3.4-1968', lambda: 'ascii'),
    ('646', lambda: 'ascii'),
])

62

63

try:
    # an explicit, non-empty HGENCODING is used as is
    encoding = os.environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding() or 'ascii'
        # replace names listed in _encodingfixers with their corrected value
        if encoding in _encodingfixers:
            encoding = _encodingfixers[encoding]()
except locale.Error:
    encoding = 'ascii'
# error handler passed to decode() by fromlocal(), lower() and upperfallback()
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# tried by tolocal() for strings that are not valid UTF-8
fallbackencoding = 'ISO-8859-1'

72

73

class localstr(str):
    '''A local-encoding string that remembers its UTF-8 original.

    The value of the instance is the local representation; the UTF-8
    form is kept in the _utf8 attribute so that an unmodified string
    can be round-tripped to the local encoding and back.'''
    def __new__(cls, u, l):
        # u is the UTF-8 form, l the local form that becomes the str value
        self = str.__new__(cls, l)
        self._utf8 = u
        return self
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

82

83

def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or some other character set.
    Decoding is therefore attempted strictly as UTF-8 first, then as
    Latin-1 (fallbackencoding), and as a last resort as UTF-8 with
    unknown characters replaced.

    Whenever the local form is lossy, a localstr is returned: it carries
    the known UTF-8 encoding next to the local representation so that
    fromlocal() can convert it back without loss.

    Raises error.Abort if the local encoding is unknown to Python.

    >>> u = 'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = 'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> 'foo: ?' in d
    False
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    try:
        try:
            # a strict decode proves the string really is stored in UTF-8
            uni = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # fast path: local encoding is UTF-8 as well
                return s
            local = uni.encode(encoding, "replace")
            if local.decode(encoding) != uni:
                # something got replaced: keep the UTF-8 original alongside
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return local
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                uni = s.decode(fallbackencoding)
                local = uni.encode(encoding, "replace")
                if local.decode(encoding) != uni:
                    return localstr(uni.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return local
            except UnicodeDecodeError:
                # last ditch: the result can't be round-tripped
                uni = s.decode("utf-8", "replace")
                return uni.encode(encoding, "replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

143

144

def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    A localstr is converted losslessly by returning its cached UTF-8
    form. Any other string is decoded with the error mode taken from
    HGENCODINGMODE, which defaults to 'strict'. In that mode an unknown
    character aborts with an error message. Other modes include
    'replace', which substitutes a special Unicode character, and
    'ignore', which drops the character.

    Raises error.Abort on undecodable input (in strict mode) or when the
    local encoding is unknown to Python.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8

    try:
        return s.decode(encoding, encodingmode).encode("utf-8")
    except UnicodeDecodeError as err:
        # quote up to ten bytes on either side of the offending position
        lo = max(0, err.start - 10)
        hi = err.start + 10
        raise error.Abort("decoding near '%s': %s!" % (s[lo:hi], err))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

166

167

# East Asian width classes that ucolwidth() counts as two columns. Setting
# HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class ('A') to them.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"

170

171

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

174

175

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    widthclass = getattr(unicodedata, 'east_asian_width', None)
    if widthclass is None:
        # no width data available: count one column per character
        return len(d)
    columns = 0
    for ch in d:
        # classes listed in 'wide' take two columns, everything else one
        if widthclass(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns

181

182

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate slices end at byte indexes start + c up to len(s) - 1; the
    first one that is exactly c columns wide is returned. When none
    matches (the slice ending at len(s) is never tried) the result is
    None.'''
    stop = start + c
    limit = len(s)
    while stop < limit:
        piece = s[start:stop]
        if colwidth(piece) == c:
            return piece
        stop += 1
    return None

189

190

def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    The right side of 's' is cut off unless 'leftside' is True, in which
    case the left side is. 'ellipsis' always goes on the trimmed side.
    A string that cannot be decoded in the local encoding is trimmed
    byte by byte instead of column by column.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++67890
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 8, leftside=True)
    34567890
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 5, leftside=True)
    \xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
    +++\x66\x77\x88\x99\xaa
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 8, leftside=True)
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    # columns left for the text itself once the ellipsis is accounted for
    room = width - len(ellipsis)

    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # not decodable: treat every byte as one column
        if len(s) <= width:
            return s # trimming is not needed
        if room <= 0:
            return ellipsis[:width] # not enough room even for ellipsis
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width:
        return s # trimming is not needed

    if room <= 0:
        return ellipsis[:width] # not enough room even for ellipsis

    # drop one more character per round until what is kept fits
    for cut in xrange(1, len(u)):
        if leftside:
            kept = u[cut:]
        else:
            kept = u[:-cut]
        if ucolwidth(kept) <= room:
            local = kept.encode(encoding)
            if leftside:
                return ellipsis + local
            return local + ellipsis
    return ellipsis # not enough room for multi-column characters

282

283

def _asciilower(s):
    '''lowercase s, which has to be pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is thrown away; it is only there to raise
    s.decode('ascii')
    folded = s.lower()
    return folded

289

290

def asciilower(s):
    '''lowercase the ASCII string s with the best implementation available

    The first call uses parsers.asciilower if the parsers module has
    one, and _asciilower otherwise, and rebinds the module-level name
    asciilower to that implementation.'''
    global asciilower
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    asciilower = getattr(parsers, 'asciilower', _asciilower)
    return asciilower(s)

298

299

def _asciiupper(s):
    '''uppercase s, which has to be pure ASCII

    Raises UnicodeDecodeError if non-ASCII characters are found.'''
    # the decode result is thrown away; it is only there to raise
    s.decode('ascii')
    folded = s.upper()
    return folded

305

306

def asciiupper(s):
    '''uppercase the ASCII string s with the best implementation available

    The first call uses parsers.asciiupper if the parsers module has
    one, and _asciiupper otherwise, and rebinds the module-level name
    asciiupper to that implementation.'''
    global asciiupper
    # delay importing avoids cyclic dependency around "parsers" in
    # pure Python build (util => i18n => encoding => parsers => util)
    import parsers
    asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
    return asciiupper(s)

314

315

def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # not pure ASCII: fold via unicode below
        pass
    try:
        if not isinstance(s, localstr):
            uni = s.decode(encoding, encodingmode)
        else:
            # use the exact UTF-8 original rather than the lossy local form
            uni = s._utf8.decode("utf-8")

        folded = uni.lower()
        if folded != uni:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.lower() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

335

336

def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        # not pure ASCII: take the encoding-aware route
        folded = upperfallback(s)
    return folded

342

343

def upperfallback(s):
    '''uppercase local string s by way of unicode

    s itself is returned when uppercasing changes nothing. If s cannot
    be decoded, or the result cannot be encoded, only the plain
    s.upper() is applied. Raises error.Abort when the local encoding is
    unknown to Python.'''
    try:
        if not isinstance(s, localstr):
            uni = s.decode(encoding, encodingmode)
        else:
            # use the exact UTF-8 original rather than the lossy local form
            uni = s._utf8.decode("utf-8")

        folded = uni.upper()
        if folded != uni:
            return folded.encode(encoding)
        return s # preserve localstring
    except UnicodeError:
        return s.upper() # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")

358

359

class normcasespecs(object):
    '''describes how a platform's normcase treats ASCII strings

    The value is chosen per platform and should be consistent with what
    normcase on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''
    other = 0
    upper = 1
    lower = -1

373

374

# byte -> JSON text for that byte; filled in by the first jsonescape() call
_jsonmap = {}

def jsonescape(s):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape('this is a test')
    'this is a test'
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
    >>> jsonescape('a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape('')
    ''
    '''

    if not _jsonmap:
        # control characters default to the generic \uXXXX form; the
        # backslash is doubled so the literal does not depend on how the
        # interpreter treats a bare "\u" in a plain string
        for x in xrange(32):
            _jsonmap[chr(x)] = "\\u%04x" % x
        # everything else passes through unchanged ...
        for x in xrange(32, 256):
            c = chr(x)
            _jsonmap[c] = c
        # ... except the characters JSON has a short escape for
        _jsonmap['\t'] = '\\t'
        _jsonmap['\n'] = '\\n'
        _jsonmap['\"'] = '\\"'
        _jsonmap['\\'] = '\\\\'
        _jsonmap['\b'] = '\\b'
        _jsonmap['\f'] = '\\f'
        _jsonmap['\r'] = '\\r'

    return ''.join(_jsonmap[c] for c in toutf8b(s))

416

417

# Number of bytes to try decoding, indexed by the high nibble of the first
# byte. 0 stands for a plain ASCII byte. Nibbles 0x8-0xb get 1, so such a
# byte is decoded on its own by getutf8char().
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Returns the bytes making up that character.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    first = s[pos]
    # the first nibble tells how many bytes to attempt decoding
    nbytes = _utf8len[ord(first) >> 4]
    if nbytes == 0: # ascii
        return first

    candidate = s[pos:pos + nbytes]
    # the decode result is discarded; it only validates the bytes
    candidate.decode("utf-8")
    return candidate

435

417

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # Walk s one UTF-8 character at a time. Valid characters are copied;
    # a byte that does not start one is surrogate-encoded by itself and
    # scanning resumes at the very next byte. Working by position keeps
    # an invalid byte from being mistaken for an equal byte that starts
    # the following valid character.
    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
            pos += 1
        pieces.append(c)
    return ''.join(pieces)

465

484

466

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    The original binary string is returned. This is a round-trip
    process for strings like filenames, but metadata that was passed
    through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - without an "\xed" byte s holds no uDxxx sequence
    if "\xed" not in s:
        return s

    pieces = []
    for ch in s.decode("utf-8"):
        code = ord(ch)
        if code & 0xff00 == 0xdc00:
            # uDCxx stands for the single raw byte xx
            pieces.append(chr(code & 0xff))
        else:
            pieces.append(ch.encode("utf-8"))
    return "".join(pieces)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             # These unicode characters are ignored by HFS+ (Apple Technote 1150,
             # "Unicode Subtleties"), so we need to ignore them in some places for
             # sanity.
             _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
                        "200c 200d 200e 200f 202a 202b 202c 202d 202e "
                        "206a 206b 206c 206d 206e 206f feff".split()]
             # verify the next function will work
             assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
             def hfsignoreclean(s):
                 """Strip every codepoint that HFS+ ignores from the byte string s.

                 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
                 '.hg'
                 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
                 '.hg'
                 """
                 # cheap pre-check: all ignored sequences start with one of these bytes
                 if "\xe2" not in s and "\xef" not in s:
                     return s
                 for seq in _ignore:
                     s = s.replace(seq, '')
                 return s
             def _getpreferredencoding():
                 '''Return the character set selected by the locale environment.

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some
                 distributions patch Python to fix this. Instead, we use it as a
                 'fixer' for the mac-roman encoding, as it is unlikely that this
                 encoding is the one actually expected.

                 The process-wide LC_CTYPE setting is switched to the user's default
                 only while the codeset is queried and is restored afterwards, even
                 if the query raises.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 locale.setlocale(locale.LC_CTYPE, "")
                 try:
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # never leave the process running in the temporary locale
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Encoding names reported by the platform that must not be used as-is;
             # each maps to a callable returning the encoding to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             try:
                 # an explicit HGENCODING wins over whatever the locale reports
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     if encoding in _encodingfixers:
                         encoding = _encodingfixers[encoding]()
             except locale.Error:
                 encoding = 'ascii'
             # error handling mode used when decoding local strings
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # encoding tried for stored strings that are not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that remembers its UTF-8 source.

                 The instance itself holds the local representation; the UTF-8 bytes
                 are kept in _utf8 so that an unmodified string can be round-tripped
                 to the local encoding and back.'''
                 def __new__(cls, u, l):
                     # u: UTF-8 bytes, l: the same text in the local encoding
                     self = str.__new__(cls, l)
                     self._utf8 = u
                     return self
                 def __hash__(self):
                     # hashing the UTF-8 form avoids collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 A LookupError raised while converting is re-raised as error.Abort
                 with a hint to check the locale settings.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the UTF-8 original next to the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(fallbackencoding)
                             r = u.encode(encoding, "replace")
                             if u == r.decode(encoding):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # cache the re-encoded UTF-8 form, not the legacy bytes
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             return u.encode(encoding, "replace") # can't round-trip
                 except LookupError as k:
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8

                 A localstr is returned as its cached UTF-8 form. Any other string is
                 decoded using the encoding mode set by HGENCODINGMODE, which defaults
                 to 'strict'. In this mode, unknown characters will cause an error
                 message. Other modes include 'replace', which replaces unknown
                 characters with a special Unicode character, and 'ignore', which
                 drops the character.
                 """
                 if isinstance(s, localstr):
                     # lossless round-trip: the UTF-8 original was cached
                     return s._utf8
                 try:
                     decoded = s.decode(encoding, encodingmode)
                     return decoded.encode("utf-8")
                 except UnicodeDecodeError as err:
                     # quote up to ten bytes on either side of the offending position
                     begin = max(0, err.start - 10)
                     context = s[begin:err.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (context, err))
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             # East Asian Width classes counted as two columns by ucolwidth().
             # Ambiguous-width ("A") characters are included only when
             # HGENCODINGAMBIGUOUS is set to 'wide'.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Return the display column width of local-encoding string s"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the display column width of Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for ch in d:
                     if eaw(ch) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Return the substring of s starting at byte index start that is
                 exactly c columns wide according to colwidth.

                 End offsets from start + c up to, but not including, len(s) are
                 tried in order. Nothing is returned explicitly (so the result is
                 None) when none of them yields a c-column substring.'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                     end += 1
             def trim(s, width, ellipsis='', leftside=False):
                 """Trim string 's' to at most 'width' columns (including 'ellipsis').

                 If 'leftside' is True, left side of string 's' is trimmed.
                 'ellipsis' is always placed at trimmed side.

                 's' is decoded with the module-level 'encoding'. If that fails, it
                 is trimmed by byte count instead of by display columns.

                 >>> ellipsis = '+++'
                 >>> from mercurial import encoding
                 >>> encoding.encoding = 'utf-8'
                 >>> t= '1234567890'
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 1234567890
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 12345+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++67890
                 >>> print trim(t, 8)
                 12345678
                 >>> print trim(t, 8, leftside=True)
                 34567890
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
                 >>> t = u.encode(encoding.encoding)
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \xe3\x81\x82\xe3\x81\x84+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 5)
                 \xe3\x81\x82\xe3\x81\x84
                 >>> print trim(t, 5, leftside=True)
                 \xe3\x81\x88\xe3\x81\x8a
                 >>> print trim(t, 4, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
                 +++
                 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
                 >>> print trim(t, 12, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 10, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8, ellipsis=ellipsis)
                 \x11\x22\x33\x44\x55+++
                 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
                 +++\x66\x77\x88\x99\xaa
                 >>> print trim(t, 8)
                 \x11\x22\x33\x44\x55\x66\x77\x88
                 >>> print trim(t, 8, leftside=True)
                 \x33\x44\x55\x66\x77\x88\x99\xaa
                 >>> print trim(t, 3, ellipsis=ellipsis)
                 +++
                 >>> print trim(t, 1, ellipsis=ellipsis)
                 +
                 """
                 try:
                     u = s.decode(encoding)
                 except UnicodeDecodeError:
                     # undecodable input: trim by bytes, one column per byte
                     if len(s) <= width: # trimming is not needed
                         return s
                     width -= len(ellipsis)
                     if width <= 0: # not enough room even for ellipsis
                         return ellipsis[:width + len(ellipsis)]
                     if leftside:
                         return ellipsis + s[-width:]
                     return s[:width] + ellipsis
                 if ucolwidth(u) <= width: # trimming is not needed
                     return s
                 width -= len(ellipsis)
                 if width <= 0: # not enough room even for ellipsis
                     return ellipsis[:width + len(ellipsis)]
                 # uslice(i) drops i characters from the trimmed side; concat puts
                 # the ellipsis on that same side
                 if leftside:
                     uslice = lambda i: u[i:]
                     concat = lambda s: ellipsis + s
                 else:
                     uslice = lambda i: u[:-i]
                     concat = lambda s: s + ellipsis
                 # drop one more character at a time until the rest fits
                 for i in xrange(1, len(u)):
                     usub = uslice(i)
                     if ucolwidth(usub) <= width:
                         return concat(usub.encode(encoding))
                 return ellipsis # not enough room for multi-column characters
             def _asciilower(s):
                 '''Lowercase s, which must contain ASCII characters only.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decoding serves only as validation; its result is discarded
                 s.decode('ascii')
                 lowered = s.lower()
                 return lowered
             def asciilower(s):
                 '''Lowercase the ASCII string s.

                 The implementation from the parsers module is used when it provides
                 one, _asciilower otherwise. The first call rebinds the module-level
                 name asciilower to the chosen implementation.'''
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 import parsers
                 global asciilower
                 asciilower = getattr(parsers, 'asciilower', _asciilower)
                 return asciilower(s)
             def _asciiupper(s):
                 '''Uppercase s, which must contain ASCII characters only.

                 Raises UnicodeDecodeError if non-ASCII characters are found.'''
                 # decoding serves only as validation; its result is discarded
                 s.decode('ascii')
                 uppered = s.upper()
                 return uppered
             def asciiupper(s):
                 '''Uppercase the ASCII string s.

                 The implementation from the parsers module is used when it provides
                 one, _asciiupper otherwise. The first call rebinds the module-level
                 name asciiupper to the chosen implementation.'''
                 # delay importing avoids cyclic dependency around "parsers" in
                 # pure Python build (util => i18n => encoding => parsers => util)
                 import parsers
                 global asciiupper
                 asciiupper = getattr(parsers, 'asciiupper', _asciiupper)
                 return asciiupper(s)
             def lower(s):
                 "best-effort encoding-aware lowercasing of local string s"
                 try:
                     return asciilower(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: continue with the encoding-aware path
                     pass
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(encoding, encodingmode)
                     folded = text.lower()
                     if folded != text:
                         return folded.encode(encoding)
                     return s # nothing changed, preserve localstring
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware uppercasing of local string s"
                 try:
                     result = asciiupper(s)
                 except UnicodeDecodeError:
                     # not pure ASCII: use the encoding-aware implementation
                     result = upperfallback(s)
                 return result
             def upperfallback(s):
                 "encoding-aware uppercasing of local string s, used for non-ASCII input"
                 try:
                     if isinstance(s, localstr):
                         text = s._utf8.decode("utf-8")
                     else:
                         text = s.decode(encoding, encodingmode)
                     folded = text.upper()
                     if folded != text:
                         return folded.encode(encoding)
                     return s # nothing changed, preserve localstring
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError as err:
                     raise error.Abort(err, hint="please check your locale settings")
             class normcasespecs(object):
                 '''what a platform's normcase does to ASCII strings

                 This is specified per platform, and should be consistent with what
                 normcase on that platform actually does.

                 other: the fallback function should always be called
                 lower: normcase lowercases ASCII strings
                 upper: normcase uppercases ASCII strings

                 This should be kept in sync with normcase_spec in util.h.'''
                 other = 0
                 lower = -1
                 upper = 1
             # byte -> JSON-escaped text, filled in by the first jsonescape() call
             _jsonmap = {}
             def jsonescape(s):
                 '''returns a string suitable for JSON

                 JSON is problematic for us because it doesn't support non-Unicode
                 bytes. To deal with this, we take the following approach:

                 - localstr objects are converted back to UTF-8
                 - valid UTF-8/ASCII strings are passed as-is
                 - other strings are converted to UTF-8b surrogate encoding
                 - apply JSON-specified string escaping

                 (escapes are doubled in these tests)

                 >>> jsonescape('this is a test')
                 'this is a test'
                 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
                 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
                 >>> jsonescape('a weird byte: \\xdd')
                 'a weird byte: \\xed\\xb3\\x9d'
                 >>> jsonescape('utf-8: caf\\xc3\\xa9')
                 'utf-8: caf\\xc3\\xa9'
                 >>> jsonescape('')
                 ''
                 '''
                 if not _jsonmap:
                     # control characters default to the generic \uXXXX form; the
                     # backslash is written out explicitly so the literal does not
                     # depend on "\u" being an unrecognized escape
                     for x in xrange(32):
                         _jsonmap[chr(x)] = "\\u%04x" % x
                     # every other byte maps to itself
                     for x in xrange(32, 256):
                         c = chr(x)
                         _jsonmap[c] = c
                     # characters with a dedicated short escape override the defaults
                     _jsonmap['\t'] = '\\t'
                     _jsonmap['\n'] = '\\n'
                     _jsonmap['\"'] = '\\"'
                     _jsonmap['\\'] = '\\\\'
                     _jsonmap['\b'] = '\\b'
                     _jsonmap['\f'] = '\\f'
                     _jsonmap['\r'] = '\\r'
                 return ''.join(_jsonmap[c] for c in toutf8b(s))
+            _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
+            def getutf8char(s, pos):
+                '''get the next full utf-8 character in the given string, starting at pos
+                Raises a UnicodeError if the given location does not start a valid
+                utf-8 character.
+                '''
+                # find how many bytes to attempt decoding from first nibble
+                l = _utf8len[ord(s[pos]) >> 4]
+                if not l: # ascii
+                    return s[pos]
+                c = s[pos:pos + l]
+                # validate with attempted decode
+                c.decode("utf-8")
+                return c
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help
                 - U+DCxx sequences already present in the input are escaped byte
                   by byte as well, so that fromutf8b can restore them

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)
                 '''
                 if isinstance(s, localstr):
                     return s._utf8
                 if "\xed" not in s:
                     # fast path: without a "\xed" byte there can be no U+DCxx
                     # sequence to re-escape, so valid UTF-8 passes through as-is
                     try:
                         s.decode('utf-8')
                         return s
                     except UnicodeDecodeError:
                         pass
                 # Walk s one character at a time. Comparing s against a copy
                 # decoded with 'ignore' can attribute a byte to the wrong
                 # character (e.g. "\xc3\xc3\xa9") and yield invalid UTF-8.
                 out = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     try:
                         c = getutf8char(s, pos)
                         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                             # existing U+DCxx character: escape its first byte
                             # here, the remaining bytes follow on the next rounds
                             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                             pos += 1
                         else:
                             pos += len(c)
                     except UnicodeDecodeError:
                         # not the start of a valid character: surrogate-encode
                         # this single byte and resume right after it
                         c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
                         pos += 1
                     out.append(c)
                 return ''.join(out)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 return the original binary string. This
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 Raises UnicodeDecodeError if s is not valid UTF-8.

                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> n = toutf8b(m)
                 >>> n
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> fromutf8b(n) == m
                 True
                 '''
                 # fast path - look for uDxxx prefixes in s
                 if "\xed" not in s:
                     return s
                 # Work on bytes rather than on a decoded unicode object: Python
                 # builds that use UTF-16 internally split non-BMP characters into
                 # surrogate halves, which would be mangled when re-encoded.
                 out = []
                 pos = 0
                 end = len(s)
                 while pos < end:
                     c = getutf8char(s, pos)
                     pos += len(c)
                     if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                         # U+DCxx escape: the low byte is the original raw byte
                         c = chr(ord(c.decode("utf-8")) & 0xff)
                     out.append(c)
                 return ''.join(out)