upstream/mercurial-mirror Commit - r37172:f8e1f48d

1

# stringutil.py - utility for generic string formatting, parsing, etc.

1

# stringutil.py - utility for generic string formatting, parsing, etc.

2

#

2

#

3

4

5

6

#

6

#

7

# This software may be used and distributed according to the terms of the

7

# This software may be used and distributed according to the terms of the

8

# GNU General Public License version 2 or any later version.

8

# GNU General Public License version 2 or any later version.

9

10

from __future__ import absolute_import

10

from __future__ import absolute_import

11

12

import codecs

12

import codecs

13

import re as remod

13

import re as remod

14

import textwrap

14

import textwrap

15

16

from ..i18n import _

16

from ..i18n import _

17

18

from .. import (

18

from .. import (

19

encoding,

19

encoding,

20

error,

20

error,

21

pycompat,

21

pycompat,

22

)

22

)

23

24

# Lookup table used by escapedata(): every possible byte value maps to its
# generic ``\xNN`` spelling, then the entries for backslash, CR and LF are
# overridden with their short, conventional escapes.
_DATA_ESCAPE_MAP = dict(
    (pycompat.bytechr(code), br'\x%02x' % code) for code in range(256))
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'

# Bytes that escapedata() rewrites: 0x00-0x08, 0x0a-0x1f, the backslash and
# 0x7f-0xff. Note that the tab character (0x09) is not in the class and is
# therefore left untouched.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')

31

32

def escapedata(s):
    """Return ``s`` with every byte matched by _DATA_ESCAPE_RE replaced by
    its escaped form from _DATA_ESCAPE_MAP.

    A bytearray argument is converted to bytes before substitution.
    """
    if isinstance(s, bytearray):
        s = bytes(s)

    def _escape(found):
        # each match is a single byte; look up its replacement
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escape, s)

37

38

def binary(s):
    """return true if a string is binary data

    A value counts as binary when it is non-empty and contains a NUL byte.
    Empty values and None yield False.
    """
    # The needle is spelled as a bytes literal so the containment test also
    # works on Python 3, where mixing a native str needle with bytes data
    # raises TypeError.
    return bool(s and b'\0' in s)

41

42

def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.
    raises error.ParseError if an 're:' pattern fails to compile.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        try:
            flags = 0
            if not casesensitive:
                flags = remod.I
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            # Format the exception through forcebytestr() rather than
            # interpolating the exception object itself: bytes %-formatting
            # on Python 3 rejects objects that are not bytes-like.
            raise error.ParseError(_('invalid regular expression: %s')
                                   % forcebytestr(e))
        # regex matchers use search(), i.e. they match anywhere in the input
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    # literal matchers require the whole input to equal the pattern
    match = pattern.__eq__

    if not casesensitive:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match

100

101

def shortuser(user):
    """Return a short representation of a user name or email address.

    The value is cut at the first '@', then anything up to and including
    the first '<' is dropped, and finally the result is cut at the first
    space and at the first '.'.
    """
    # keep only what precedes the domain part
    user = user.partition('@')[0]
    # drop a leading "Real Name <" portion, if there is one
    _name, bracket, address = user.partition('<')
    if bracket:
        user = address
    # cut at the first space, then at the first dot
    for delimiter in (' ', '.'):
        user = user.partition(delimiter)[0]
    return user

116

117

def emailuser(user):
    """Return the user portion of an email address.

    Everything from the first '@' onwards is discarded, and so is anything
    up to and including the first '<' of what remains.
    """
    local = user.partition('@')[0]
    _name, bracket, address = local.partition('<')
    return address if bracket else local

126

127

def email(author):
    '''get email of author.

    Returns the text between the first '<' and the first '>'. A missing
    '<' means the result starts at the beginning of the string; a missing
    '>' means it runs to the end.
    '''
    # find() yields -1 when there is no '<', so the start becomes 0
    start = author.find('<') + 1
    end = author.find('>')
    if end == -1:
        return author[start:]
    return author[start:end]

133

134

# A name not containing '<', one whitespace character, then an address
# containing an '@' enclosed in angle brackets.
_correctauthorformat = remod.compile(br'^[^<]+\s\<[^<>]+@[^<>]+\>$')

def isauthorwellformed(author):
    '''Return True if the author field is well formed
    (ie "Contributor Name <contrib@email.dom>")

    >>> isauthorwellformed(b'Good Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Author <good@author.com>')
    True
    >>> isauthorwellformed(b'Bad Author')
    False
    >>> isauthorwellformed(b'Bad Author <author@author.com')
    False
    >>> isauthorwellformed(b'Bad Author author@author.com')
    False
    >>> isauthorwellformed(b'<author@author.com>')
    False
    >>> isauthorwellformed(b'Bad Author <author>')
    False
    '''
    # a match object is always truthy, None is falsy
    found = _correctauthorformat.match(author)
    return bool(found)

156

134

def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    The work is delegated to encoding.trim(), with '...' passed as the
    ellipsis marker.
    """
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed

137

160

138

def escapestr(s):
    """Return the backslash-escaped form of the byte string ``s``."""
    # Use the function behind s.encode('string_escape') directly, since
    # that codec name is not available on Python 3. It returns a pair whose
    # first item is the escaped data.
    escaped, _consumed = codecs.escape_encode(s)
    return escaped

142

165

143

def unescapestr(s):
    """Return ``s`` with backslash escapes decoded; inverse of escapestr()."""
    # escape_decode() returns a pair whose first item is the decoded data
    decoded, _consumed = codecs.escape_decode(s)
    return decoded

145

168

146

def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    pycompat.bytestr() is tried first; if that raises UnicodeEncodeError
    the object is converted via str() and encoding.strtolocal() instead.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        converted = pycompat.bytestr(localized)
    return converted

154

177

155

def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')

158

181

159

# delay import of textwrap

182

# delay import of textwrap

160

def _MBTextWrapper(**kwargs):
    """Build the width-aware TextWrapper subclass and return an instance.

    The class is defined on the first call only: the module-level name
    ``_MBTextWrapper`` is then rebound to the class itself, so later calls
    instantiate it directly. ``kwargs`` are passed to the class constructor.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires use decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ucstr into (head, tail) where head is the longest
            prefix whose accumulated column width does not exceed
            space_left."""
            used = 0
            colwidth = encoding.ucolwidth
            # enumerate() replaces the Python 2-only xrange() index loop
            for pos, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:pos], ucstr[pos:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            # always leave room for at least one column
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # put as much of the long chunk as fits on this line and
                # leave the remainder on the stack
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # not allowed to break it: give it a line of its own
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    chunkwidth = colwidth(chunks[-1])

                    # Nope, this line is full.
                    if cur_len + chunkwidth > width:
                        break

                    # Can at least squeeze this chunk onto the current line.
                    cur_line.append(chunks.pop())
                    cur_len += chunkwidth

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # replace this factory with the class so it is only defined once
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)

262

285

263

def wrap(line, width, initindent='', hangindent=''):

286

def wrap(line, width, initindent='', hangindent=''):

264

maxindent = max(len(hangindent), len(initindent))

287

maxindent = max(len(hangindent), len(initindent))

265

if width <= maxindent:

288

if width <= maxindent:

266

# adjust for weird terminal size

289

# adjust for weird terminal size

267

width = max(78, maxindent + 1)

290

width = max(78, maxindent + 1)

268

line = line.decode(pycompat.sysstr(encoding.encoding),

291

line = line.decode(pycompat.sysstr(encoding.encoding),

269