upstream/mercurial-mirror Commit - r21856:d24969ee

1

# encoding.py - character transcoding support for Mercurial

1

# encoding.py - character transcoding support for Mercurial

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

import error

8

import error

9

import unicodedata, locale, os

9

import unicodedata, locale, os

10

11

def _getpreferredencoding():
    '''
    Return the codeset named by the environment's LC_CTYPE locale.

    On darwin, getpreferredencoding ignores the locale environment and
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
    for Python 2.7 and up. This is the same corrected code for earlier
    Python versions.

    However, we can't use a version check for this method, as some
    distributions patch Python to fix this. Instead, we use it as a 'fixer'
    for the mac-roman encoding, as it is unlikely that this encoding is the
    one actually expected.

    When locale.CODESET is not available, the encoding component of
    locale.getdefaultlocale() is returned instead.
    '''
    try:
        locale.CODESET
    except AttributeError:
        # Fall back to parsing environment variables :-(
        return locale.getdefaultlocale()[1]

    # setlocale() with a single argument only queries the current setting
    oldloc = locale.setlocale(locale.LC_CTYPE)
    try:
        locale.setlocale(locale.LC_CTYPE, "")
        return locale.nl_langinfo(locale.CODESET)
    finally:
        # the locale is process-wide state: put it back even if one of the
        # calls above raised
        locale.setlocale(locale.LC_CTYPE, oldloc)

34

35

# Encoding names as reported by locale.getpreferredencoding() that need
# replacing, mapped to a callable producing the name to use instead.
_encodingfixers = {
    '646': lambda: 'ascii',
    'ANSI_X3.4-1968': lambda: 'ascii',
    'mac-roman': _getpreferredencoding
}

# HGENCODING, when set and non-empty, is used as-is; otherwise the encoding
# is detected from the locale and passed through the fixers above.
encoding = os.environ.get("HGENCODING")
if not encoding:
    try:
        encoding = locale.getpreferredencoding() or 'ascii'
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
    except locale.Error:
        encoding = 'ascii'

# error handler name handed to str.decode() by fromlocal(), lower(), upper()
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
# encoding tried by tolocal() when stored data is not valid UTF-8
fallbackencoding = 'ISO-8859-1'

50

51

class localstr(str):
    '''A local-encoding string that remembers its UTF-8 form.

    The instance's string value is the local representation 'l'; the
    UTF-8 bytes 'u' it was derived from are kept in the _utf8 attribute
    so that unmodified strings can be round-tripped to the local
    encoding and back.'''
    def __new__(cls, u, l):
        obj = super(localstr, cls).__new__(cls, l)
        obj._utf8 = u
        return obj
    def __hash__(self):
        # hash the UTF-8 form to avoid collisions in local string space
        return hash(self._utf8)

60

61

def tolocal(s):

61

def tolocal(s):

62

"""

62

"""

63

Convert a string from internal UTF-8 to local encoding

63

Convert a string from internal UTF-8 to local encoding

64

65

All internal strings should be UTF-8 but some repos before the

65

All internal strings should be UTF-8 but some repos before the

66

implementation of locale support may contain latin1 or possibly

66

implementation of locale support may contain latin1 or possibly

67

other character sets. We attempt to decode everything strictly

67

other character sets. We attempt to decode everything strictly

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

68

using UTF-8, then Latin-1, and failing that, we use UTF-8 and

69

replace unknown characters.

69

replace unknown characters.

70

71

The localstr class is used to cache the known UTF-8 encoding of

71

The localstr class is used to cache the known UTF-8 encoding of

72

strings next to their local representation to allow lossless

72

strings next to their local representation to allow lossless

73

round-trip conversion back to UTF-8.

73

round-trip conversion back to UTF-8.

74

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

75

>>> u = 'foo: \\xc3\\xa4' # utf-8

76

>>> l = tolocal(u)

76

>>> l = tolocal(u)

77

>>> l

77

>>> l

78

'foo: ?'

78

'foo: ?'

79

>>> fromlocal(l)

79

>>> fromlocal(l)

80

'foo: \\xc3\\xa4'

80

'foo: \\xc3\\xa4'

81

>>> u2 = 'foo: \\xc3\\xa1'

81

>>> u2 = 'foo: \\xc3\\xa1'

82

>>> d = { l: 1, tolocal(u2): 2 }

82

>>> d = { l: 1, tolocal(u2): 2 }

83

>>> len(d) # no collision

83

>>> len(d) # no collision

84

2

84

2

85

>>> 'foo: ?' in d

85

>>> 'foo: ?' in d

86

False

86

False

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

87

>>> l1 = 'foo: \\xe4' # historical latin1 fallback

88

>>> l = tolocal(l1)

88

>>> l = tolocal(l1)

89

>>> l

89

>>> l

90

'foo: ?'

90

'foo: ?'

91

>>> fromlocal(l) # magically in utf-8

91

>>> fromlocal(l) # magically in utf-8

92

'foo: \\xc3\\xa4'

92

'foo: \\xc3\\xa4'

93

"""

93

"""

94

95

try:

95

try:

96

try:

96

try:

97

# make sure string is actually stored in UTF-8

97

# make sure string is actually stored in UTF-8

98

u = s.decode('UTF-8')

98

u = s.decode('UTF-8')

99

if encoding == 'UTF-8':

99

if encoding == 'UTF-8':

100

# fast path

100

# fast path

101

return s

101

return s

102

r = u.encode(encoding, "replace")

102

r = u.encode(encoding, "replace")

103

if u == r.decode(encoding):

103

if u == r.decode(encoding):

104

# r is a safe, non-lossy encoding of s

104

# r is a safe, non-lossy encoding of s

105

return r

105

return r

106

return localstr(s, r)

106

return localstr(s, r)

107

except UnicodeDecodeError:

107

except UnicodeDecodeError:

108

# we should only get here if we're looking at an ancient changeset

108

# we should only get here if we're looking at an ancient changeset

109

try:

109

try:

110

u = s.decode(fallbackencoding)

110

u = s.decode(fallbackencoding)

111

r = u.encode(encoding, "replace")

111

r = u.encode(encoding, "replace")

112

if u == r.decode(encoding):

112

if u == r.decode(encoding):

113

# r is a safe, non-lossy encoding of s

113

# r is a safe, non-lossy encoding of s

114

return r

114

return r

115

return localstr(u.encode('UTF-8'), r)

115

return localstr(u.encode('UTF-8'), r)

116

except UnicodeDecodeError:

116

except UnicodeDecodeError:

117

u = s.decode("utf-8", "replace") # last ditch

117

u = s.decode("utf-8", "replace") # last ditch

118

return u.encode(encoding, "replace") # can't round-trip

118

return u.encode(encoding, "replace") # can't round-trip

119

except LookupError, k:

119

except LookupError, k:

120

raise error.Abort(k, hint="please check your locale settings")

120

raise error.Abort(k, hint="please check your locale settings")

121

122

def fromlocal(s):

122

def fromlocal(s):

123

"""

123

"""

124

Convert a string from the local character encoding to UTF-8

124

Convert a string from the local character encoding to UTF-8

125

126

We attempt to decode strings using the encoding mode set by

126

We attempt to decode strings using the encoding mode set by

127

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

127

HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown

128

characters will cause an error message. Other modes include

128

characters will cause an error message. Other modes include

129

'replace', which replaces unknown characters with a special

129

'replace', which replaces unknown characters with a special

130

Unicode character, and 'ignore', which drops the character.

130

Unicode character, and 'ignore', which drops the character.

131

"""

131

"""

132

133

# can we do a lossless round-trip?

133

# can we do a lossless round-trip?

134

if isinstance(s, localstr):

134

if isinstance(s, localstr):

135

return s._utf8

135

return s._utf8

136

137

try:

137

try:

138

return s.decode(encoding, encodingmode).encode("utf-8")

138

return s.decode(encoding, encodingmode).encode("utf-8")

139

except UnicodeDecodeError, inst:

139

except UnicodeDecodeError, inst:

140

sub = s[max(0, inst.start - 10):inst.start + 10]

140

sub = s[max(0, inst.start - 10):inst.start + 10]

141

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

141

raise error.Abort("decoding near '%s': %s!" % (sub, inst))

142

except LookupError, k:

142

except LookupError, k:

143

raise error.Abort(k, hint="please check your locale settings")

143

raise error.Abort(k, hint="please check your locale settings")

144

145

# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to 'wide'
# to treat them as wide. The value is the set of east-asian-width classes
# that ucolwidth() counts as two columns.
if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    wide = "WFA"
else:
    wide = "WF"

148

149

def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # undecodable bytes are replaced rather than raising
    decoded = s.decode(encoding, 'replace')
    return ucolwidth(decoded)

152

153

def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no width data available: count one column per character
        return len(d)
    columns = 0
    for ch in d:
        # characters whose width class is listed in 'wide' take two columns
        if eaw(ch) in wide:
            columns += 2
        else:
            columns += 1
    return columns

159

160

def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Returns the first slice s[start:x] whose column width is exactly c,
    or None when no such slice exists.'''
    # x is an exclusive end index, so it has to be allowed to reach len(s);
    # stopping one short missed substrings that run to the end of s
    for x in xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t

167

168

def trim(s, width, ellipsis=''):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    's' is returned unchanged when it already fits. Otherwise trailing
    characters are dropped and 'ellipsis' is appended. If 's' cannot be
    decoded in the local encoding, bytes are counted instead of columns.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t= '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # undecodable input: measure and cut in bytes
        if len(s) <= width: # trimming is not needed
            return s
        room = width - len(ellipsis)
        if room <= 0: # not enough room even for ellipsis
            return ellipsis[:width]
        return s[:room] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0: # not enough room even for ellipsis
        return ellipsis[:width]

    # drop one more trailing character per round until the rest fits
    for cut in xrange(1, len(u)):
        head = u[:-cut]
        if ucolwidth(head) <= room:
            return head.encode(encoding) + ellipsis
    return ellipsis # not enough room for multi-column characters

237

168

def lower(s):

238

def lower(s):

169

"best-effort encoding-aware case-folding of local string s"

239

"best-effort encoding-aware case-folding of local string s"

170

try:

240

try:

171

s.decode('ascii') # throw exception for non-ASCII character

241

s.decode('ascii') # throw exception for non-ASCII character

172

return s.lower()

242

return s.lower()

173

except UnicodeDecodeError:

243

except UnicodeDecodeError:

174

pass

244

pass

175

try:

245

try:

176

if isinstance(s, localstr):

246

if isinstance(s, localstr):

177

u = s._utf8.decode("utf-8")

247

u = s._utf8.decode("utf-8")

178

else:

248

else:

179

u = s.decode(encoding, encodingmode)

249

u = s.decode(encoding, encodingmode)

180

250

181

lu = u.lower()

251

lu = u.lower()

182

if u == lu:

252

if u == lu:

183

return s # preserve localstring

253

return s # preserve localstring

184

return lu.encode(encoding)

254

return lu.encode(encoding)

185

except UnicodeError:

255

except UnicodeError:

186

return s.lower() # we don't know how to fold this except in ASCII

256

return s.lower() # we don't know how to fold this except in ASCII

187

except LookupError, k:

257

except LookupError, k:

188

raise error.Abort(k, hint="please check your locale settings")

258

raise error.Abort(k, hint="please check your locale settings")

189

259

190

def upper(s):

260

def upper(s):

191

"best-effort encoding-aware case-folding of local string s"

261

"best-effort encoding-aware case-folding of local string s"

192

try:

262

try:

193

s.decode('ascii') # throw exception for non-ASCII character

263

s.decode('ascii') # throw exception for non-ASCII character

194

return s.upper()

264

return s.upper()

195

except UnicodeDecodeError:

265

except UnicodeDecodeError:

196

pass

266

pass

197

try:

267

try:

198

if isinstance(s, localstr):

268

if isinstance(s, localstr):

199

u = s._utf8.decode("utf-8")

269

u = s._utf8.decode("utf-8")

200

else:

270

else:

201

u = s.decode(encoding, encodingmode)

271

u = s.decode(encoding, encodingmode)

202

272

203

uu = u.upper()

273

uu = u.upper()

204

if u == uu:

274

if u == uu:

205

return s # preserve localstring

275

return s # preserve localstring

206

return uu.encode(encoding)

276

return uu.encode(encoding)

207

except UnicodeError:

277

except UnicodeError:

208

return s.upper() # we don't know how to fold this except in ASCII

278

return s.upper() # we don't know how to fold this except in ASCII

209

except LookupError, k:

279

except LookupError, k:

210

raise error.Abort(k, hint="please check your locale settings")

280

raise error.Abort(k, hint="please check your locale settings")

211

281

212

def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        return s._utf8

    try:
        # only validity matters here; testing the decoded value for truth
        # made the empty string fall through and return None
        s.decode('utf-8')
        return s
    except UnicodeDecodeError:
        pass

    # surrogate-encode any characters that don't round-trip: s2 is s with
    # the undecodable bytes dropped, and the loop walks both in step
    # NOTE(review): a dropped byte that equals the next byte kept in s2 is
    # matched in its place, so the wrong byte gets escaped - confirm
    s2 = s.decode('utf-8', 'ignore').encode('utf-8')
    pieces = []
    pos = 0
    for c in s:
        if s2[pos:pos + 1] == c:
            pieces.append(c)
            pos += 1
        else:
            pieces.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
    return "".join(pieces)

260

330

261

def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Characters in the range uDC00-uDCFF are turned back into the single
    bytes they stand for; everything else is kept as UTF-8. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> m = "\\xc3\\xa9\\x99abcd"
    >>> n = toutf8b(m)
    >>> n
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> fromutf8b(n) == m
    True
    '''

    # fast path - look for uDxxx prefixes in s
    if "\xed" not in s:
        return s

    pieces = []
    for ch in s.decode("utf-8"):
        code = ord(ch)
        if code & 0xff00 == 0xdc00:
            # escaped byte: the low 8 bits are the original value
            pieces.append(chr(code & 0xff))
        else:
            pieces.append(ch.encode("utf-8"))
    return "".join(pieces)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # encoding.py - character transcoding support for Mercurial
             #
             #  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             import error
             import unicodedata, locale, os
             def _getpreferredencoding():
                 '''
                 Return the codeset of the user's locale as reported by nl_langinfo.

                 On darwin, getpreferredencoding ignores the locale environment and
                 always returns mac-roman. http://bugs.python.org/issue6202 fixes this
                 for Python 2.7 and up. This is the same corrected code for earlier
                 Python versions.

                 However, we can't use a version check for this method, as some distributions
                 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
                 encoding, as it is unlikely that this encoding is the one actually expected.

                 The LC_CTYPE locale is switched to the user's default for the duration
                 of the query and is always restored afterwards, even if the query fails.
                 '''
                 try:
                     locale.CODESET
                 except AttributeError:
                     # Fall back to parsing environment variables :-(
                     return locale.getdefaultlocale()[1]
                 oldloc = locale.setlocale(locale.LC_CTYPE)
                 try:
                     locale.setlocale(locale.LC_CTYPE, "")
                     return locale.nl_langinfo(locale.CODESET)
                 finally:
                     # LC_CTYPE is process-wide state: put it back no matter what
                     locale.setlocale(locale.LC_CTYPE, oldloc)
             # Maps an encoding name reported by locale.getpreferredencoding() to a
             # callable returning the encoding name to use instead.
             _encodingfixers = {
                 '646': lambda: 'ascii',
                 'ANSI_X3.4-1968': lambda: 'ascii',
                 'mac-roman': _getpreferredencoding
             }
             # Pick the local encoding: HGENCODING wins when set and non-empty,
             # otherwise the (possibly fixed-up) locale encoding is used. Any
             # locale.Error raised while querying the locale falls back to 'ascii'.
             try:
                 encoding = os.environ.get("HGENCODING")
                 if not encoding:
                     encoding = locale.getpreferredencoding() or 'ascii'
                     # names without a fixer are kept unchanged
                     encoding = _encodingfixers.get(encoding, lambda: encoding)()
             except locale.Error:
                 encoding = 'ascii'
             # error-handling mode passed to decode() when converting local strings
             encodingmode = os.environ.get("HGENCODINGMODE", "strict")
             # encoding tried by tolocal() when stored data is not valid UTF-8
             fallbackencoding = 'ISO-8859-1'
             class localstr(str):
                 '''A local-encoding string that carries its known UTF-8 form, so that
                 unmodified strings can be round-tripped to the local encoding and
                 back without loss'''
                 def __new__(cls, u, l):
                     # the str value is the local form l; the UTF-8 form u rides along
                     obj = super(localstr, cls).__new__(cls, l)
                     obj._utf8 = u
                     return obj
                 def __hash__(self):
                     # hash the UTF-8 form to avoid collisions in local string space
                     return hash(self._utf8)
             def tolocal(s):
                 """
                 Convert a string from internal UTF-8 to local encoding

                 All internal strings should be UTF-8 but some repos before the
                 implementation of locale support may contain latin1 or possibly
                 other character sets. We attempt to decode everything strictly
                 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
                 replace unknown characters.

                 The localstr class is used to cache the known UTF-8 encoding of
                 strings next to their local representation to allow lossless
                 round-trip conversion back to UTF-8.

                 Returns a plain string when the conversion is lossless (or cannot be
                 round-tripped at all), and a localstr when characters had to be
                 replaced. Raises error.Abort if an encoding lookup fails.

                 >>> u = 'foo: \\xc3\\xa4' # utf-8
                 >>> l = tolocal(u)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l)
                 'foo: \\xc3\\xa4'
                 >>> u2 = 'foo: \\xc3\\xa1'
                 >>> d = { l: 1, tolocal(u2): 2 }
                 >>> len(d) # no collision
                 2
                 >>> 'foo: ?' in d
                 False
                 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
                 >>> l = tolocal(l1)
                 >>> l
                 'foo: ?'
                 >>> fromlocal(l) # magically in utf-8
                 'foo: \\xc3\\xa4'
                 """
                 try:
                     try:
                         # make sure string is actually stored in UTF-8
                         u = s.decode('UTF-8')
                         if encoding == 'UTF-8':
                             # fast path
                             return s
                         r = u.encode(encoding, "replace")
                         if u == r.decode(encoding):
                             # r is a safe, non-lossy encoding of s
                             return r
                         # lossy: keep the original UTF-8 next to the local form
                         return localstr(s, r)
                     except UnicodeDecodeError:
                         # we should only get here if we're looking at an ancient changeset
                         try:
                             u = s.decode(fallbackencoding)
                             r = u.encode(encoding, "replace")
                             if u == r.decode(encoding):
                                 # r is a safe, non-lossy encoding of s
                                 return r
                             # lossy: cache the UTF-8 re-encoding of the fallback decode
                             return localstr(u.encode('UTF-8'), r)
                         except UnicodeDecodeError:
                             u = s.decode("utf-8", "replace") # last ditch
                             return u.encode(encoding, "replace") # can't round-trip
                 except LookupError, k:
                     # a codec lookup failed for one of the encoding names used above
                     raise error.Abort(k, hint="please check your locale settings")
             def fromlocal(s):
                 """
                 Convert a string from the local character encoding to UTF-8
                 We attempt to decode strings using the encoding mode set by
                 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
                 characters will cause an error message. Other modes include
                 'replace', which replaces unknown characters with a special
                 Unicode character, and 'ignore', which drops the character.
                 """
                 # can we do a lossless round-trip?
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     return s.decode(encoding, encodingmode).encode("utf-8")
                 except UnicodeDecodeError, inst:
                     sub = s[max(0, inst.start - 10):inst.start + 10]
                     raise error.Abort("decoding near '%s': %s!" % (sub, inst))
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             # East-asian-width classes that count as two columns. Setting
             # HGENCODINGAMBIGUOUS to 'wide' adds the ambiguous class 'A' to them.
             if os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
                 wide = "WFA"
             else:
                 wide = "WF"
             def colwidth(s):
                 "Return the number of display columns taken by local-encoding string s"
                 # undecodable bytes are replaced rather than raising
                 decoded = s.decode(encoding, 'replace')
                 return ucolwidth(decoded)
             def ucolwidth(d):
                 "Return the number of display columns taken by Unicode string d"
                 eaw = getattr(unicodedata, 'east_asian_width', None)
                 if eaw is None:
                     # no width data available: count one column per character
                     return len(d)
                 columns = 0
                 for c in d:
                     # characters in a wide class take two columns, all others one
                     if eaw(c) in wide:
                         columns += 2
                     else:
                         columns += 1
                 return columns
             def getcols(s, start, c):
                 '''Return the substring of s that begins at byte index start and is
                 exactly c columns wide according to colwidth.

                 Candidate end offsets run from start + c up to, but not including,
                 len(s); None is returned when no candidate matches.'''
                 end = start + c
                 limit = len(s)
                 while end < limit:
                     piece = s[start:end]
                     if colwidth(piece) == c:
                         return piece
                     end += 1
                 return None
+            def trim(s, width, ellipsis=''):
+                """Trim string 's' to at most 'width' columns (including 'ellipsis').
+                >>> ellipsis = '+++'
+                >>> from mercurial import encoding
+                >>> encoding.encoding = 'utf-8'
+                >>> t= '1234567890'
+                >>> print trim(t, 12, ellipsis=ellipsis)
+                1234567890
+                >>> print trim(t, 10, ellipsis=ellipsis)
+                1234567890
+                >>> print trim(t, 8, ellipsis=ellipsis)
++++
+                >>> print trim(t, 8)
+                12345678
+                >>> print trim(t, 3, ellipsis=ellipsis)
+                +++
+                >>> print trim(t, 1, ellipsis=ellipsis)
+                +
+                >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
+                >>> t = u.encode(encoding.encoding)
+                >>> print trim(t, 12, ellipsis=ellipsis)
+                \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
+                >>> print trim(t, 10, ellipsis=ellipsis)
+                \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
+                >>> print trim(t, 8, ellipsis=ellipsis)
+                \xe3\x81\x82\xe3\x81\x84+++
+                >>> print trim(t, 5)
+                \xe3\x81\x82\xe3\x81\x84
+                >>> print trim(t, 4, ellipsis=ellipsis)
+                +++
+                >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
+                >>> print trim(t, 12, ellipsis=ellipsis)
+                \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
+                >>> print trim(t, 10, ellipsis=ellipsis)
+                \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
+                >>> print trim(t, 8, ellipsis=ellipsis)
+                \x11\x22\x33\x44\x55+++
+                >>> print trim(t, 8)
+                \x11\x22\x33\x44\x55\x66\x77\x88
+                >>> print trim(t, 3, ellipsis=ellipsis)
+                +++
+                >>> print trim(t, 1, ellipsis=ellipsis)
+                +
+                """
+                try:
+                    u = s.decode(encoding)
+                except UnicodeDecodeError:
+                    if len(s) <= width: # trimming is not needed
+                        return s
+                    width -= len(ellipsis)
+                    if width <= 0: # no enough room even for ellipsis
+                        return ellipsis[:width + len(ellipsis)]
+                    return s[:width] + ellipsis
+                if ucolwidth(u) <= width: # trimming is not needed
+                    return s
+                width -= len(ellipsis)
+                if width <= 0: # no enough room even for ellipsis
+                    return ellipsis[:width + len(ellipsis)]
+                uslice = lambda i: u[:-i]
+                concat = lambda s: s + ellipsis
+                for i in xrange(1, len(u)):
+                    usub = uslice(i)
+                    if ucolwidth(usub) <= width:
+                        return concat(usub.encode(encoding))
+                return ellipsis # no enough room for multi-column characters
             def lower(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     s.decode('ascii') # throw exception for non-ASCII character
                     return s.lower()
                 except UnicodeDecodeError:
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     lu = u.lower()
                     if u == lu:
                         return s # preserve localstring
                     return lu.encode(encoding)
                 except UnicodeError:
                     return s.lower() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             def upper(s):
                 "best-effort encoding-aware case-folding of local string s"
                 try:
                     s.decode('ascii') # throw exception for non-ASCII character
                     return s.upper()
                 except UnicodeDecodeError:
                     pass
                 try:
                     if isinstance(s, localstr):
                         u = s._utf8.decode("utf-8")
                     else:
                         u = s.decode(encoding, encodingmode)
                     uu = u.upper()
                     if u == uu:
                         return s # preserve localstring
                     return uu.encode(encoding)
                 except UnicodeError:
                     return s.upper() # we don't know how to fold this except in ASCII
                 except LookupError, k:
                     raise error.Abort(k, hint="please check your locale settings")
             def toutf8b(s):
                 '''convert a local, possibly-binary string into UTF-8b

                 This is intended as a generic method to preserve data when working
                 with schemes like JSON and XML that have no provision for
                 arbitrary byte strings. As Mercurial often doesn't know
                 what encoding data is in, we use so-called UTF-8b.

                 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
                 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
                 uDC00-uDCFF.

                 Principles of operation:

                 - ASCII and UTF-8 data successfully round-trips and is understood
                   by Unicode-oriented clients
                 - filenames and file contents in arbitrary other encodings can
                   be round-tripped or recovered by clueful clients
                 - local strings that have a cached known UTF-8 encoding (aka
                   localstr) get sent as UTF-8 so Unicode-oriented clients get the
                   Unicode data they want
                 - because we must preserve UTF-8 bytestring in places such as
                   filenames, metadata can't be roundtripped without help

                 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
                 arbitrary bytes into an internal Unicode format that can be
                 re-encoded back into the original. Here we are exposing the
                 internal surrogate encoding as a UTF-8 string.)

                 The empty string is valid UTF-8 and is returned unchanged:

                 >>> toutf8b('')
                 ''
                 '''
                 if isinstance(s, localstr):
                     return s._utf8
                 try:
                     # only the success of the decode matters, not the truth value of
                     # its result: the empty string must pass through as well
                     s.decode('utf-8')
                     return s
                 except UnicodeDecodeError:
                     pass
                 # surrogate-encode any characters that don't round-trip
                 # NOTE(review): walking s against s2 byte by byte looks like it can
                 # pair an invalid lead byte with the identical first byte of a valid
                 # sequence that follows it (e.g. '\xc3\xc3\xa9') - confirm
                 s2 = s.decode('utf-8', 'ignore').encode('utf-8')
                 r = []
                 pos = 0
                 for c in s:
                     if s2[pos:pos + 1] == c:
                         # byte survived the lenient decode: keep it as is
                         r.append(c)
                         pos += 1
                     else:
                         # byte was dropped as invalid: escape it as uDC00 + byte
                         r.append(unichr(0xdc00 + ord(c)).encode('utf-8'))
                 return "".join(r)
             def fromutf8b(s):
                 '''Given a UTF-8b string, return a local, possibly-binary string.

                 Characters in the range uDC00-uDCFF are turned back into the single
                 bytes they stand for, so the original binary string is returned. This
                 is a round-trip process for strings like filenames, but metadata
                 that was passed through tolocal will remain in UTF-8.

                 >>> m = "\\xc3\\xa9\\x99abcd"
                 >>> n = toutf8b(m)
                 >>> n
                 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
                 >>> fromutf8b(n) == m
                 True
                 '''
                 # fast path - without a "\xed" byte there is no uDxxx prefix in s
                 if "\xed" not in s:
                     return s
                 pieces = []
                 for ch in s.decode("utf-8"):
                     code = ord(ch)
                     if code & 0xff00 == 0xdc00:
                         # escaped byte: its value is the low 8 bits of the code point
                         pieces.append(chr(code & 0xff))
                     else:
                         pieces.append(ch.encode("utf-8"))
                 return "".join(pieces)