##// END OF EJS Templates
minirst: refactor/simplify findblocks
Martin Geisler -
r12651:17f28de1 default
parent child Browse files
Show More
@@ -1,441 +1,437
1 1 # minirst.py - minimal reStructuredText parser
2 2 #
3 3 # Copyright 2009, 2010 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 """simplified reStructuredText parser.
9 9
10 10 This parser knows just enough about reStructuredText to parse the
11 11 Mercurial docstrings.
12 12
13 13 It cheats in a major way: nested blocks are not really nested. They
14 14 are just indented blocks that look like they are nested. This relies
15 15 on the user to keep the right indentation for the blocks.
16 16
17 17 It only supports a small subset of reStructuredText:
18 18
19 19 - sections
20 20
21 21 - paragraphs
22 22
23 23 - literal blocks
24 24
25 25 - definition lists
26 26
27 27 - specific admonitions
28 28
29 29 - bullet lists (items must start with '-')
30 30
31 31 - enumerated lists (no autonumbering)
32 32
33 33 - field lists (colons cannot be escaped)
34 34
35 35 - option lists (supports only long options without arguments)
36 36
37 37 - inline literals (no other inline markup is recognized)
38 38 """
39 39
40 40 import re, sys
41 41 import util, encoding
42 42 from i18n import _
43 43
44 44
def replace(text, substs):
    """Apply a sequence of (old, new) substitutions to text.

    The text is decoded with encoding.encoding before the replacements
    are applied, one pair after the other in the given order, and the
    result is encoded back with the same encoding.
    """
    decoded = text.decode(encoding.encoding)
    for old, new in substs:
        decoded = decoded.replace(old, new)
    return decoded.encode(encoding.encoding)
50 50
51
# A block separator: a newline followed by one or more blank (empty or
# whitespace-only) lines.
_blockre = re.compile(r"\n(?:\s*\n)+")

def findblocks(text):
    """Find continuous blocks of lines in text.

    Blocks are runs of non-blank lines separated by one or more blank
    (empty or whitespace-only) lines.

    Returns a list of dictionaries representing the blocks. Each block
    has an 'indent' field (the smallest leading-whitespace width among
    its lines) and a 'lines' field (the lines with that common
    indentation removed). Empty or whitespace-only text gives an empty
    list.
    """
    blocks = []
    # Only trailing whitespace is stripped here. Stripping leading
    # whitespace as well would eat the indentation of the very first
    # line and make the first block report a too small indent.
    for b in _blockre.split(text.rstrip()):
        # Blank lines can survive at the very start of the text since
        # the separator regexp needs a newline in front of them. Drop
        # them so they neither form a block nor skew the indentation.
        lines = [l for l in b.splitlines() if l.strip()]
        if not lines:
            continue
        indent = min((len(l) - len(l.lstrip())) for l in lines)
        lines = [l[indent:] for l in lines]
        blocks.append(dict(indent=indent, lines=lines))
    return blocks
71 67
72 68
def findliteralblocks(blocks):
    """Finds literal blocks and adds a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'. A literal block is a block that follows
    a paragraph ending in "::" and is indented deeper than it. The
    "::" marker is removed from the paragraph (or the whole paragraph
    is removed if it consists of "::" only).

    The list is modified in place and returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # When the removed "::" block was the very first block, i
            # is now -1 and there is no preceding block to inspect.
            # Indexing with -1 would wrongly look at the *last* block.
            if i >= 0:
                m = _bulletre.match(blocks[i]['lines'][0])
                if m:
                    indent += m.end()
                    adjustment -= m.end()

            # Mark the following indented blocks.
            while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
                blocks[i + 1]['type'] = 'literal'
                blocks[i + 1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks
123 119
# Regexps recognizing the first line of the supported list item kinds.
_bulletre = re.compile(r'(-|[0-9A-Za-z]+\.|\(?[0-9A-Za-z]+\)|\|) ')
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
_fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):[ ]+(.*)')
_definitionre = re.compile(r'[^ ]')

def splitparagraphs(blocks):
    """Split paragraphs into lists.

    A paragraph whose first line starts a list item is replaced by one
    block per item. The new blocks get the list type as their 'type'
    and inherit the indentation of the paragraph. Blocks of any other
    type are left alone. The list is modified in place and returned.
    """
    # Tuples with (list type, item regexp, single line items?). Order
    # matters: definition lists has the least specific regexp and must
    # come last.
    listtypes = [('bullet', _bulletre, True),
                 ('option', _optionre, True),
                 ('field', _fieldre, True),
                 ('definition', _definitionre, False)]

    def startsitem(lines, pos, itemre, singleline):
        """Does itemre match an item at line pos?

        A list item can be followed by an indented line or another list
        item (but only if singleline is True).
        """
        if not itemre.match(lines[pos]):
            return False
        # The line after the candidate, or '' when it is the last one.
        following = ''
        if pos + 1 < len(lines):
            following = lines[pos + 1]
        if not singleline:
            return following.startswith(' ')
        return (following == '' or following[0] == ' '
                or itemre.match(following))

    idx = 0
    while idx < len(blocks):
        block = blocks[idx]
        if block['type'] == 'paragraph':
            lines = block['lines']
            for listtype, itemre, singleline in listtypes:
                if not startsitem(lines, 0, itemre, singleline):
                    continue
                # The first line always opens an item, so every later
                # line has an item to be appended to.
                items = []
                for pos, line in enumerate(lines):
                    if startsitem(lines, pos, itemre, singleline):
                        items.append(dict(type=listtype, lines=[],
                                          indent=block['indent']))
                    items[-1]['lines'].append(line)
                blocks[idx:idx + 1] = items
                break
        idx += 1
    return blocks
170 166
171 167
# Column width used for field keys when formatting field lists.
_fieldwidth = 12

def updatefieldlists(blocks):
    """Find key and maximum key width for field lists.

    For every run of consecutive 'field' blocks, the key is split off
    the first line of each block and stored in its 'key' field, and
    the length of the longest key in the run is stored in the
    'keywidth' field of all blocks of the run. The list is modified in
    place and returned.
    """
    start = 0
    while start < len(blocks):
        if blocks[start]['type'] != 'field':
            start += 1
            continue

        # Walk to the end of this run of field blocks, splitting off
        # the keys on the way.
        widest = 0
        end = start
        while end < len(blocks) and blocks[end]['type'] == 'field':
            field = blocks[end]
            key, rest = _fieldre.match(field['lines'][0]).groups()
            field['lines'][0] = rest
            field['key'] = key
            widest = max(widest, len(key))
            end += 1

        for field in blocks[start:end]:
            field['keywidth'] = widest
        # blocks[end] (if any) is known not to be a field, skip it.
        start = end + 1

    return blocks
197 193
198 194
def prunecontainers(blocks, keep):
    """Prune unwanted containers.

    A container is a paragraph whose first line starts with
    '.. container::' together with the deeper indented blocks that
    follow it. The directive block itself is always removed. When the
    container type is found in keep, the contained blocks are kept and
    dedented by the distance between the directive and the first
    contained block. Otherwise the contained blocks are removed too.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    Returns a (blocks, pruned) tuple where pruned lists the types of
    the removed containers in the order they were found. The blocks
    list is modified in place.
    """
    pruned = []
    i = 0
    while i + 1 < len(blocks):
        # Searching for a block that looks like this:
        #
        # +-------+---------------------------+
        # | ".. container ::" type            |
        # +---+                               |
        #     | blocks                        |
        #     +-------------------------------+
        if (blocks[i]['type'] == 'paragraph' and
            blocks[i]['lines'][0].startswith('.. container::')):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent
            containertype = blocks[i]['lines'][0][15:]
            prune = containertype not in keep
            if prune:
                pruned.append(containertype)

            # Always delete "..container:: type" block
            del blocks[i]
            j = i
            # Step back exactly once so that the block which moved
            # into position i is examined on the next iteration (it
            # may be another container). Stepping back once per pruned
            # block would rewind too far, possibly to a negative
            # index.
            i -= 1
            while j < len(blocks) and blocks[j]['indent'] > indent:
                if prune:
                    del blocks[j]
                else:
                    blocks[j]['indent'] -= adjustment
                    j += 1
        i += 1
    return blocks, pruned
236 232
237 233
# A section underline: one punctuation character repeated at least
# twice and nothing else on the line.
_sectionre = re.compile(r"""^([-=`:.'"~^_*+#])\1+$""")

def findsections(blocks):
    """Finds sections.

    A two-line paragraph whose second line is an underline of exactly
    the same length as the first line is turned into a block of type
    'section'. The underline character is stored in the 'underline'
    field and the underline itself is removed from the lines.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.
    """
    for block in blocks:
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | Section title                |
        # | -------------                |
        # +------------------------------+
        if block['type'] != 'paragraph':
            continue
        lines = block['lines']
        if len(lines) != 2:
            continue
        title, underline = lines
        if len(title) == len(underline) and _sectionre.match(underline):
            block['underline'] = underline[0]
            block['type'] = 'section'
            del lines[1]
    return blocks
261 257
262 258
def inlineliterals(blocks):
    """Turn the `` markers of inline literals into double quotes.

    Only blocks of type 'paragraph' and 'section' are touched. The
    list of blocks is returned.
    """
    quoting = [('``', '"')]
    for block in blocks:
        if block['type'] not in ('paragraph', 'section'):
            continue
        rewritten = []
        for line in block['lines']:
            rewritten.append(replace(line, quoting))
        block['lines'] = rewritten
    return blocks
269 265
270 266
def hgrole(blocks):
    """Turn :hg:`command` into "hg command".

    Only blocks of type 'paragraph' and 'section' are touched. The
    list of blocks is returned.
    """
    # The second substitution closes the quote. Replacing every
    # remaining back-quote also works when there is a line break in
    # the command, but relies on the fact that we have no stray
    # back-quotes in the input (run the blocks through inlineliterals
    # first).
    roles = [(':hg:`', '"hg '), ('`', '"')]
    for block in blocks:
        if block['type'] not in ('paragraph', 'section'):
            continue
        rewritten = []
        for line in block['lines']:
            rewritten.append(replace(line, roles))
        block['lines'] = rewritten
    return blocks
281 277
282 278
def addmargins(blocks):
    """Adds empty blocks for vertical spacing.

    Consecutive blocks of the same type are kept together with no
    vertical space between them when that type is 'bullet', 'option'
    or 'field'. A block of type 'margin' is inserted between all other
    neighbouring blocks. The list is modified in place and returned.
    """
    grouped = ('bullet', 'option', 'field')
    pos = 1
    while pos < len(blocks):
        current = blocks[pos]['type']
        if current in grouped and current == blocks[pos - 1]['type']:
            pos += 1
            continue
        blocks.insert(pos, dict(lines=[''], indent=0, type='margin'))
        # Skip both the new margin and the block that followed it.
        pos += 2
    return blocks
298 294
# Matches the directive that opens an admonition, e.g. ".. note::".
_admonitionre = re.compile(r"\.\. (admonition|attention|caution|danger|"
                           r"error|hint|important|note|tip|warning)::",
                           flags=re.IGNORECASE)

def findadmonitions(blocks):
    """
    Makes the type of the block an admonition block if
    the first line is an admonition directive

    The lower-cased directive name is stored in the 'admonitiontitle'
    field. The directive line is removed from the block; text that
    follows the directive on the same line is kept as a line of its
    own. The list of blocks is returned.
    """
    for block in blocks:
        directive = block['lines'][0]
        m = _admonitionre.match(directive)
        if not m:
            continue
        block['type'] = 'admonition'
        # Cut the name out from between the leading ".. " and the
        # trailing "::" of the match.
        block['admonitiontitle'] = directive[3:m.end() - 2].lower()

        # Skip the character separating the directive from its text.
        remainder = directive[m.end() + 1:]
        if remainder:
            # NOTE(review): this page rendering has lost runs of
            # whitespace elsewhere, so the prefix literal below may
            # originally have been longer than one space -- confirm.
            block['lines'].insert(1, ' ' + remainder)
        del block['lines'][0]
    return blocks
323 319
def formatblock(block, width):
    """Format a block according to width.

    Returns the block rendered as a string. A width of zero or less
    selects a width of 78. For bullet and field blocks the first line
    of the block is rewritten in place as part of the formatting.
    """
    if width <= 0:
        width = 78
    kind = block['type']
    lines = block['lines']
    indent = ' ' * block['indent']

    if kind == 'margin':
        return ''

    if kind == 'admonition':
        # NOTE(review): _admonitionre also accepts ".. admonition::"
        # but there is no 'admonition' entry below, so such a block
        # raises KeyError here -- confirm the intended title.
        titles = {'attention': _('Attention:'),
                  'caution': _('Caution:'),
                  'danger': _('!Danger!'),
                  'error': _('Error:'),
                  'hint': _('Hint:'),
                  'important': _('Important:'),
                  'note': _('Note:'),
                  'tip': _('Tip:'),
                  'warning': _('Warning!')}

        admonition = titles[block['admonitiontitle']]
        # The body hangs at the indentation of the last line.
        lastline = lines[-1]
        hang = len(lastline) - len(lastline.lstrip())

        defindent = indent + hang * ' '
        text = ' '.join(map(str.strip, lines))
        wrapped = util.wrap(text, width=width,
                            initindent=defindent,
                            hangindent=defindent)
        return '%s\n%s' % (indent + admonition, wrapped)

    if kind == 'literal':
        # NOTE(review): this page rendering has lost runs of
        # whitespace elsewhere, so the extra indentation literal below
        # may originally have been longer than one space -- confirm.
        indent += ' '
        return indent + ('\n' + indent).join(lines)

    if kind == 'section':
        title = lines[0]
        underline = len(title) * block['underline']
        return "%s%s\n%s%s" % (indent, title, indent, underline)

    if kind == 'definition':
        term = indent + lines[0]
        # The definition hangs at the indentation of the last line.
        lastline = lines[-1]
        hang = len(lastline) - len(lastline.lstrip())
        defindent = indent + hang * ' '
        text = ' '.join(map(str.strip, lines[1:]))
        wrapped = util.wrap(text, width=width,
                            initindent=defindent,
                            hangindent=defindent)
        return '%s\n%s' % (term, wrapped)

    # Everything else is wrapped as one paragraph. Only the hanging
    # indentation of the continuation lines depends on the type.
    subindent = indent
    if kind == 'bullet':
        if lines[0].startswith('| '):
            # Remove bullet for line blocks and add no extra
            # indention.
            lines[0] = lines[0][2:]
        else:
            m = _bulletre.match(lines[0])
            subindent = indent + m.end() * ' '
    elif kind == 'field':
        keywidth = block['keywidth']
        key = block['key']

        subindent = indent + _fieldwidth * ' '
        if len(key) + 2 > _fieldwidth:
            # key too large, use full line width
            key = key.ljust(width)
        elif keywidth + 2 < _fieldwidth:
            # all keys are small, add only two spaces
            key = key.ljust(keywidth + 2)
            subindent = indent + (keywidth + 2) * ' '
        else:
            # mixed sizes, use fieldwidth for this one
            key = key.ljust(_fieldwidth)
        lines[0] = key + lines[0]
    elif kind == 'option':
        m = _optionre.match(lines[0])
        option, arg, rest = m.groups()
        # Continuation lines line up with the option description.
        subindent = indent + (len(option) + len(arg)) * ' '

    text = ' '.join(map(str.strip, lines))
    return util.wrap(text, width=width,
                     initindent=indent,
                     hangindent=subindent)
398 394
399 395
def format(text, width, indent=0, keep=None):
    """Parse and format the text according to width.

    indent is added to the indentation of every block found in text.
    keep is passed to prunecontainers to select the containers to
    retain.

    Returns the formatted text when keep is None. Otherwise a tuple
    (text, pruned) is returned, where pruned is the list of pruned
    containers reported by prunecontainers.
    """
    blocks = findblocks(text)
    for b in blocks:
        b['indent'] += indent
    blocks = findliteralblocks(blocks)
    blocks, pruned = prunecontainers(blocks, keep or [])
    # The remaining passes all map a list of blocks to a list of
    # blocks. They must run in this order.
    passes = (findsections, inlineliterals, hgrole, splitparagraphs,
              updatefieldlists, addmargins, findadmonitions)
    for step in passes:
        blocks = step(blocks)
    text = '\n'.join(formatblock(b, width) for b in blocks)
    if keep is None:
        return text
    return text, pruned
419 415
420 416
421 417 if __name__ == "__main__":
422 418 from pprint import pprint
423 419
424 420 def debug(func, *args):
425 421 blocks = func(*args)
426 422 print "*** after %s:" % func.__name__
427 423 pprint(blocks)
428 424 print
429 425 return blocks
430 426
431 427 text = open(sys.argv[1]).read()
432 428 blocks = debug(findblocks, text)
433 429 blocks = debug(findliteralblocks, blocks)
434 430 blocks, pruned = debug(prunecontainers, blocks, sys.argv[2:])
435 431 blocks = debug(inlineliterals, blocks)
436 432 blocks = debug(splitparagraphs, blocks)
437 433 blocks = debug(updatefieldlists, blocks)
438 434 blocks = debug(findsections, blocks)
439 435 blocks = debug(addmargins, blocks)
440 436 blocks = debug(findadmonitions, blocks)
441 437 print '\n'.join(formatblock(b, 30) for b in blocks)
General Comments 0
You need to be logged in to leave comments. Login now