##// END OF EJS Templates
minirst: combine list parsing in one function...
Martin Geisler -
r9737:5f101af4 default
parent child Browse files
Show More
@@ -1,354 +1,272 b''
1 1 # minirst.py - minimal reStructuredText parser
2 2 #
3 3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2, incorporated herein by reference.
7 7
8 8 """simplified reStructuredText parser.
9 9
10 10 This parser knows just enough about reStructuredText to parse the
11 11 Mercurial docstrings.
12 12
13 13 It cheats in a major way: nested blocks are not really nested. They
14 14 are just indented blocks that look like they are nested. This relies
15 15 on the user to keep the right indentation for the blocks.
16 16
17 17 It only supports a small subset of reStructuredText:
18 18
19 19 - paragraphs
20 20
21 21 - definition lists (must use ' ' to indent definitions)
22 22
23 23 - lists (items must start with '-')
24 24
25 25 - field lists (colons cannot be escaped)
26 26
27 27 - literal blocks
28 28
29 29 - option lists (supports only long options without arguments)
30 30
31 31 - inline markup is not recognized at all.
32 32 """
33 33
34 34 import re, sys, textwrap
35 35
36 36
def findblocks(text):
    """Group the non-blank lines of text into blocks.

    A block is a maximal run of consecutive lines that contain
    something other than whitespace; blank lines separate blocks and
    are discarded.

    Returns a list of dictionaries, one per block, each with:

    - 'indent': the smallest number of leading whitespace characters
      found on any line of the block

    - 'lines': the lines of the block with that many leading
      characters removed
    """
    groups = []
    current = []
    for raw in text.splitlines():
        if raw.strip():
            current.append(raw)
        elif current:
            # A blank line closes the group collected so far.
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    result = []
    for group in groups:
        margin = min([len(row) - len(row.lstrip()) for row in group])
        result.append({'indent': margin,
                       'lines': [row[margin:] for row in group]})
    return result
57 57
58 58
def findliteralblocks(blocks):
    """Find literal blocks and add a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'.

    A paragraph whose last line ends with "::" introduces the more
    deeply indented blocks that follow it as literal blocks. The list
    is modified in place (a paragraph consisting only of "::" is
    removed) and is also returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # i is -1 when the removed "::" block was the very first
            # block. There is no preceding block to inspect in that
            # case, and blocks[-1] would wrongly look at the last
            # block of the document.
            if i >= 0 and blocks[i]['lines'][0].startswith('- '):
                indent += 2
                adjustment -= 2

            # Mark the following indented blocks.
            while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
                blocks[i + 1]['type'] = 'literal'
                blocks[i + 1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks
108 108
# Regular expressions recognizing the first line of a list item. They
# are applied with match(), i.e., anchored at the start of the line.
_bulletre = re.compile(r'- ')
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
_fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):( +)(.*)')
_definitionre = re.compile(r'[^ ]')

def splitparagraphs(blocks):
    """Split paragraphs into lists.

    Every block of type 'paragraph' whose first line starts a list
    item is replaced by one block per item. The new blocks get the
    type 'bullet', 'option', 'field' or 'definition' and inherit the
    indent of the paragraph; the item lines are kept unmodified.
    Blocks of any other type, and paragraphs that do not start with a
    list item, are left alone.

    The blocks must have a 'type' field. The list is modified in
    place and is also returned.
    """
    # Tuples with (list type, item regexp, single line items?). Order
    # matters: definition lists has the least specific regexp and must
    # come last.
    listtypes = [('bullet', _bulletre, True),
                 ('option', _optionre, True),
                 ('field', _fieldre, True),
                 ('definition', _definitionre, False)]

    def startsitem(lines, i, itemre, singleline):
        """Does itemre match an item at line i?

        A list item can be followed by an indented line or another
        list item (but only if singleline is True).
        """
        if not itemre.match(lines[i]):
            return False
        following = ''
        if i + 1 < len(lines):
            following = lines[i + 1]
        if not singleline:
            return following.startswith(' ')
        return (following == '' or following[0] == ' '
                or bool(itemre.match(following)))

    i = 0
    while i < len(blocks):
        block = blocks[i]
        if block['type'] == 'paragraph':
            lines = block['lines']
            for listtype, itemre, singleline in listtypes:
                if not startsitem(lines, 0, itemre, singleline):
                    continue
                items = []
                for j, line in enumerate(lines):
                    if startsitem(lines, j, itemre, singleline):
                        items.append(dict(type=listtype, lines=[],
                                          indent=block['indent']))
                    items[-1]['lines'].append(line)
                blocks[i:i + 1] = items
                # The new items are no longer paragraphs, so there is
                # no need to look at them again.
                i += len(items) - 1
                break
        i += 1
    return blocks
155
109 156
def findsections(blocks):
    """Mark section titles.

    A paragraph of exactly two lines whose second line consists only
    of '-' characters and is exactly as long as the first line gets
    its type changed to 'section'.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The blocks are updated in
    place and the list is returned.
    """
    for blk in blocks:
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | Section title                |
        # | -------------                |
        # +------------------------------+
        if blk['type'] != 'paragraph':
            continue
        rows = blk['lines']
        if len(rows) != 2:
            continue
        title, underline = rows
        if underline == '-' * len(title):
            blk['type'] = 'section'
    return blocks
128 175
129 176
def findbulletlists(blocks):
    """Split paragraphs that start with "- " into bullet items.

    Each line starting with "- " opens a new block of type 'bullet'
    (the two-character marker is removed from that line); other lines
    are appended to the item opened last. The items inherit the
    indent of the paragraph they came from.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The list is modified in
    place and returned.
    """
    pos = 0
    while pos < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +------+-----------------------+
        # | "- " | list item             |
        # +------| (body elements)+      |
        #        +-----------------------+
        para = blocks[pos]
        rows = para['lines']
        if para['type'] != 'paragraph' or not rows[0].startswith('- '):
            pos += 1
            continue

        bullets = []
        for row in rows:
            if row.startswith('- '):
                bullets.append({'type': 'bullet',
                                'indent': para['indent'],
                                'lines': [row[2:]]})
            else:
                bullets[-1]['lines'].append(row)
        blocks[pos:pos + 1] = bullets
        pos += len(bullets)
    return blocks
157
158
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
def findoptionlists(blocks):
    """Split paragraphs that start with a long option into options.

    Each line matched by _optionre opens a new block of type
    'option'; other lines are appended to the option opened last.
    The new blocks inherit the indent of the paragraph and record in
    'width' the combined length of the option and of the argument and
    spaces following it. The lines themselves are kept unmodified.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The list is modified in
    place and returned.
    """
    pos = 0
    while pos < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +----------------------------+-------------+
        # | "--" option "  "           | description |
        # +-------+--------------------+             |
        #         | (body elements)+                 |
        #         +----------------------------------+
        para = blocks[pos]
        rows = para['lines']
        if para['type'] != 'paragraph' or not _optionre.match(rows[0]):
            pos += 1
            continue

        found = []
        for row in rows:
            hit = _optionre.match(row)
            if hit:
                name, argument = hit.group(1), hit.group(2)
                found.append({'type': 'option',
                              'lines': [],
                              'indent': para['indent'],
                              'width': len(name) + len(argument)})
            found[-1]['lines'].append(row)
        blocks[pos:pos + 1] = found
        pos += len(found)
    return blocks
191
192
_fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):( +)(.*)')
def findfieldlists(blocks):
    """Find field lists.

    Each line of the form ":name: body" in a paragraph that starts
    with such a line opens a new block of type 'field'; other lines
    are appended to the field opened last. The new blocks inherit the
    indent of the paragraph and record in 'width' the column at which
    the field body starts.

    The two colons around the field name are dropped from the first
    line of each field and replaced by two spaces after the name, so
    that the body keeps starting at column 'width'.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The list is modified in
    place and returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        #
        # +--------------------+----------------------+
        # | ":" field name ":" | field body           |
        # +-------+------------+                      |
        #         | (body elements)+                  |
        #         +-----------------------------------+
        if (blocks[i]['type'] == 'paragraph' and
            _fieldre.match(blocks[i]['lines'][0])):
            indent = blocks[i]['indent']
            fields = []
            for line in blocks[i]['lines']:
                m = _fieldre.match(line)
                if m:
                    key, spaces, rest = m.groups()
                    # The 2 accounts for the two colons.
                    width = 2 + len(key) + len(spaces)
                    fields.append(dict(type='field', lines=[],
                                       indent=indent, width=width))
                    # Turn ":foo: bar" into "foo   bar": the two
                    # spaces stand in for the removed colons so the
                    # body still starts at column 'width'.
                    line = '%s  %s%s' % (key, spaces, rest)
                fields[-1]['lines'].append(line)
            blocks[i:i+1] = fields
            i += len(fields) - 1
        i += 1
    return blocks
228
229
def finddefinitionlists(blocks):
    """Split paragraphs made of terms and indented definitions.

    A paragraph qualifies when it has more than one line, its first
    line does not start with a space and its second line does. Every
    line that does not start with a space opens a new block of type
    'definition'; indented lines are appended to the definition
    opened last. Each new block inherits the indent of the paragraph
    and stores in 'hang' the number of leading whitespace characters
    of its last line.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The list is modified in
    place and returned.
    """
    pos = 0
    while pos < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +----------------------------+
        # | term                       |
        # +--+-------------------------+--+
        #    | definition                 |
        #    | (body elements)+           |
        #    +----------------------------+
        para = blocks[pos]
        rows = para['lines']
        qualifies = (para['type'] == 'paragraph'
                     and len(rows) > 1
                     and not rows[0].startswith(' ')
                     and rows[1].startswith(' '))
        if not qualifies:
            pos += 1
            continue

        entries = []
        for row in rows:
            if not row.startswith(' '):
                entries.append({'type': 'definition',
                                'lines': [],
                                'indent': para['indent']})
            entries[-1]['lines'].append(row)
        for entry in entries:
            last = entry['lines'][-1]
            entry['hang'] = len(last) - len(last.lstrip())
        blocks[pos:pos + 1] = entries
        pos += len(entries)
    return blocks
261
262
def inlineliterals(blocks):
    """Replace each `` marker in paragraphs with a double quote.

    Only blocks of type 'paragraph' are touched; their 'lines' list
    is replaced by a new list. The list of blocks is returned.
    """
    for blk in blocks:
        if blk['type'] != 'paragraph':
            continue
        quoted = []
        for row in blk['lines']:
            quoted.append(row.replace('``', '"'))
        blk['lines'] = quoted
    return blocks
268 182
269 183
def addmargins(blocks):
    """Insert empty 'margin' blocks for vertical spacing.

    Consecutive blocks of the same type are kept together when that
    type is 'bullet', 'option', 'field' or 'definition'. Between any
    other pair of neighbouring blocks a block of type 'margin' with
    indent 0 and a single empty line is inserted.

    The list is modified in place and returned.
    """
    grouped = ('bullet', 'option', 'field', 'definition')
    spaced = []
    previous = None
    for blk in blocks:
        if previous is not None:
            kind = blk['type']
            if kind != previous['type'] or kind not in grouped:
                spaced.append({'lines': [''], 'indent': 0, 'type': 'margin'})
        spaced.append(blk)
        previous = blk
    # Callers holding a reference to the list see the margins too.
    blocks[:] = spaced
    return blocks
285 199
286 200
def formatblock(block, width):
    """Format a block according to width.

    block is a dictionary with 'type', 'indent' and 'lines' fields. A
    width of zero or less selects a width of 78 columns. The block is
    not modified.

    Returns the formatted text as a single string:

    - 'margin' blocks give an empty string

    - 'literal' and 'section' blocks are indented but never rewrapped

    - 'definition' blocks keep the term on a line of its own and wrap
      the definition at the indentation of its last line

    - all other blocks are wrapped as one paragraph; list items
      ('bullet', 'field', 'option') get a hanging indent so that
      continuation lines line up with the item text
    """
    if width <= 0:
        width = 78
    indent = ' ' * block['indent']
    if block['type'] == 'margin':
        return ''
    if block['type'] == 'literal':
        indent += ' '
        return indent + ('\n' + indent).join(block['lines'])
    if block['type'] == 'section':
        return indent + ('\n' + indent).join(block['lines'])
    if block['type'] == 'definition':
        term = indent + block['lines'][0]
        lastline = block['lines'][-1]
        hang = len(lastline) - len(lastline.lstrip())
        defindent = indent + hang * ' '
        text = ' '.join([l.strip() for l in block['lines'][1:]])
        return "%s\n%s" % (term, textwrap.fill(text, width=width,
                                               initial_indent=defindent,
                                               subsequent_indent=defindent))

    # Work on a copy so that formatting the same block twice gives
    # the same result.
    lines = list(block['lines'])
    initindent = subindent = indent
    if block['type'] == 'bullet':
        # The "- " marker is two characters wide.
        subindent = indent + '  '
    elif block['type'] == 'field':
        m = _fieldre.match(lines[0])
        if m:
            key, spaces, rest = m.groups()
            # Turn ":foo: bar" into "foo   bar": the two spaces stand
            # in for the removed colons so that the first line of the
            # body starts at the same column as the hanging indent.
            lines[0] = '%s  %s%s' % (key, spaces, rest)
            subindent = indent + (2 + len(key) + len(spaces)) * ' '
    elif block['type'] == 'option':
        m = _optionre.match(lines[0])
        if m:
            option, arg, rest = m.groups()
            subindent = indent + (len(option) + len(arg)) * ' '

    text = ' '.join([l.strip() for l in lines])
    return textwrap.fill(text, width=width,
                         initial_indent=initindent,
                         subsequent_indent=subindent)
317 241
318 242
def format(text, width, indent=0):
    """Parse and format the text according to width.

    text is split into blocks, every block is shifted right by indent
    columns, the blocks are classified (literal blocks, list items,
    sections), margins are inserted between them, and each block is
    finally formatted with formatblock. Returns the formatted blocks
    joined by newlines.
    """
    blocks = findblocks(text)
    for b in blocks:
        b['indent'] += indent
    # splitparagraphs recognizes all list types in a single pass. It
    # keeps the item markers in the lines, which is what formatblock
    # expects.
    for step in (findliteralblocks, inlineliterals, splitparagraphs,
                 findsections, addmargins):
        blocks = step(blocks)
    return '\n'.join([formatblock(b, width) for b in blocks])
333 254
334 255
if __name__ == "__main__":
    # Debugging aid: parse the file named on the command line, dump
    # the list of blocks after every parsing step and finally print
    # the text formatted to a width of 30 columns.
    from pprint import pprint

    def debug(func, blocks):
        """Run func on blocks, dump the result and return it."""
        blocks = func(blocks)
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print('')
        return blocks

    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        # Do not rely on garbage collection to close the file.
        fp.close()
    blocks = debug(findblocks, text)
    blocks = debug(findliteralblocks, blocks)
    blocks = debug(splitparagraphs, blocks)
    blocks = debug(findsections, blocks)
    blocks = debug(addmargins, blocks)
    print('\n'.join([formatblock(b, 30) for b in blocks]))
General Comments 0
You need to be logged in to leave comments. Login now