##// END OF EJS Templates
minirst: don't test regexps twice...
Martin Geisler -
r10064:6f30c357 default
parent child Browse files
Show More
@@ -1,280 +1,277 b''
1 1 # minirst.py - minimal reStructuredText parser
2 2 #
3 3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2, incorporated herein by reference.
7 7
8 8 """simplified reStructuredText parser.
9 9
10 10 This parser knows just enough about reStructuredText to parse the
11 11 Mercurial docstrings.
12 12
13 13 It cheats in a major way: nested blocks are not really nested. They
14 14 are just indented blocks that look like they are nested. This relies
15 15 on the user to keep the right indentation for the blocks.
16 16
17 17 It only supports a small subset of reStructuredText:
18 18
19 19 - sections
20 20
21 21 - paragraphs
22 22
23 23 - literal blocks
24 24
25 25 - definition lists
26 26
27 27 - bullet lists (items must start with '-')
28 28
29 29 - enumerated lists (no autonumbering)
30 30
31 31 - field lists (colons cannot be escaped)
32 32
33 33 - option lists (supports only long options without arguments)
34 34
35 35 - inline literals (no other inline markup is recognized)
36 36 """
37 37
38 38 import re, sys, textwrap
39 39
40 40
def findblocks(text):
    """Split text into blocks of consecutive non-blank lines.

    Lines that are empty or contain only whitespace separate the
    blocks and are not part of any block.

    Returns a list of dictionaries, one per block. Each has an
    'indent' field (the smallest amount of leading whitespace found
    on any line of the block) and a 'lines' field (the lines of the
    block with that many leading characters removed).
    """
    groups = []
    current = []
    for raw in text.splitlines():
        if raw.strip():
            current.append(raw)
        elif current:
            # A blank line closes the block that is being collected.
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    result = []
    for group in groups:
        margin = min(len(raw) - len(raw.lstrip()) for raw in group)
        result.append(dict(indent=margin,
                           lines=[raw[margin:] for raw in group]))
    return result
61 61
62 62
def findliteralblocks(blocks):
    """Find literal blocks and add a 'type' field to the blocks.

    A paragraph whose last line ends with "::" introduces the more
    deeply indented blocks that follow it. Those blocks are given the
    type 'literal'; all other blocks are given the type 'paragraph'.

    The list is modified in place and also returned. A paragraph that
    consists of nothing but "::" is removed from the list, and the
    trailing "::" marker of other introducing paragraphs is rewritten
    (see the comments below).
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i+1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i+1]['indent'] - indent

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # If the removed "::" paragraph was the very first block,
            # i is now -1 and there is no block in front of the
            # literal blocks to inspect. Indexing with -1 would wrap
            # around and look at the *last* block instead, so skip the
            # check in that case.
            if i >= 0:
                m = _bulletre.match(blocks[i]['lines'][0])
                if m:
                    indent += m.end()
                    adjustment -= m.end()

            # Mark the following indented blocks.
            while i+1 < len(blocks) and blocks[i+1]['indent'] > indent:
                blocks[i+1]['type'] = 'literal'
                blocks[i+1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks
113 113
114 114 _bulletre = re.compile(r'(-|[0-9A-Za-z]+\.|\(?[0-9A-Za-z]+\)) ')
115 115 _optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
116 116 _fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):( +)(.*)')
117 117 _definitionre = re.compile(r'[^ ]')
118 118
def splitparagraphs(blocks):
    """Split paragraphs that are really lists into one block per item.

    Every block of type 'paragraph' whose first line starts a list
    item is replaced, in place, by a sequence of blocks: one for each
    item, typed after the kind of list and carrying the indent of the
    original paragraph. The (modified) list is returned.
    """
    # Tuples with (list type, item regexp, single line items?). Order
    # matters: definition lists have the least specific regexp and
    # must come last.
    kinds = [('bullet', _bulletre, True),
             ('option', _optionre, True),
             ('field', _fieldre, True),
             ('definition', _definitionre, False)]

    def startsitem(lines, pos, itemre, singleline):
        """Does itemre match an item at line pos?

        A list item can be followed by an indented line or another
        list item (but only if singleline is True).
        """
        if not itemre.match(lines[pos]):
            return False
        following = ''
        if pos + 1 < len(lines):
            following = lines[pos + 1]
        if not singleline:
            return following.startswith(' ')
        return (following == '' or following[0] == ' '
                or itemre.match(following))

    pos = 0
    while pos < len(blocks):
        current = blocks[pos]
        if current['type'] == 'paragraph':
            lines = current['lines']
            for kind, itemre, singleline in kinds:
                if not startsitem(lines, 0, itemre, singleline):
                    continue
                items = []
                for j, line in enumerate(lines):
                    # A line that starts an item opens a new block;
                    # every other line continues the current item.
                    if startsitem(lines, j, itemre, singleline):
                        items.append(dict(type=kind, lines=[],
                                          indent=current['indent']))
                    items[-1]['lines'].append(line)
                blocks[pos:pos + 1] = items
                break
        pos += 1
    return blocks
160 160
161 161
def findsections(blocks):
    """Mark section titles by changing their type to 'section'.

    A section title is a two-line paragraph whose second line consists
    of dashes and is exactly as long as the first line:

        Section title
        -------------

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The blocks are updated in
    place and the list is returned.
    """
    for entry in blocks:
        if entry['type'] != 'paragraph':
            continue
        rows = entry['lines']
        if len(rows) != 2:
            continue
        title, underline = rows
        if underline == '-' * len(title):
            entry['type'] = 'section'
    return blocks
180 180
181 181
def inlineliterals(blocks):
    """Replace the inline literal marker with a double quote.

    Only blocks of type 'paragraph' are touched; their 'lines' field
    is replaced by a new list. The list of blocks is returned.
    """
    for entry in blocks:
        if entry['type'] != 'paragraph':
            continue
        quoted = []
        for row in entry['lines']:
            quoted.append(row.replace('``', '"'))
        entry['lines'] = quoted
    return blocks
187 187
188 188
def addmargins(blocks):
    """Adds empty blocks for vertical spacing.

    Consecutive bullet, option, field, or definition blocks of the
    same type are kept together with no vertical space between them.
    Between every other pair of adjacent blocks an empty block of type
    'margin' is inserted. The list is modified in place and returned.
    """
    grouped = ('bullet', 'option', 'field', 'definition')
    pos = 1
    while pos < len(blocks):
        kind = blocks[pos]['type']
        if kind == blocks[pos - 1]['type'] and kind in grouped:
            pos += 1
            continue
        blocks.insert(pos, dict(lines=[''], indent=0, type='margin'))
        # Step over the margin just inserted and the block after it.
        pos += 2
    return blocks
204 204
205 205
def formatblock(block, width):
    """Format a block according to width.

    block is a dictionary with 'type', 'indent' and 'lines' fields and
    width is the maximum line width; a width of zero or less selects
    the default of 78. The formatted text is returned as a string
    (empty for 'margin' blocks).

    Literal and section blocks are only indented, never re-wrapped.
    All other blocks are joined into one string and wrapped with
    textwrap.fill; list items get a hanging indent so that
    continuation lines line up with the text after the item marker.

    The block passed in is not modified.
    """
    if width <= 0:
        width = 78
    indent = ' ' * block['indent']
    if block['type'] == 'margin':
        return ''
    if block['type'] == 'literal':
        # NOTE(review): this listing collapses runs of spaces, so the
        # width of the extra literal indent should be confirmed against
        # the real file.
        indent += ' '
        return indent + ('\n' + indent).join(block['lines'])
    if block['type'] == 'section':
        return indent + ('\n' + indent).join(block['lines'])
    if block['type'] == 'definition':
        term = indent + block['lines'][0]
        # The definition text keeps the indentation of its last line.
        hang = len(block['lines'][-1]) - len(block['lines'][-1].lstrip())
        defindent = indent + hang * ' '
        text = ' '.join([l.strip() for l in block['lines'][1:]])
        return "%s\n%s" % (term, textwrap.fill(text, width=width,
                                               initial_indent=defindent,
                                               subsequent_indent=defindent))

    # Work on a copy: the field branch rewrites the first line, and
    # doing that on the caller's list would make a second call on the
    # same block see the already rewritten text.
    lines = list(block['lines'])
    initindent = subindent = indent

    # splitparagraphs only creates bullet, field and option blocks
    # whose first line matches the corresponding regexp. A block built
    # some other way whose first line does not match is formatted like
    # a plain paragraph instead of raising.
    if block['type'] == 'bullet':
        m = _bulletre.match(lines[0])
        if m:
            subindent = indent + m.end() * ' '
    elif block['type'] == 'field':
        m = _fieldre.match(lines[0])
        if m:
            key, spaces, rest = m.groups()
            # Drop the two colons of ":key:" but keep the text in the
            # same column: the key is followed by two spaces so that
            # the first line is as wide as the hanging indent below.
            lines[0] = '%s  %s%s' % (key, spaces, rest)
            subindent = indent + (2 + len(key) + len(spaces)) * ' '
    elif block['type'] == 'option':
        m = _optionre.match(lines[0])
        if m:
            option, arg = m.group(1), m.group(2)
            subindent = indent + (len(option) + len(arg)) * ' '

    text = ' '.join([l.strip() for l in lines])
    return textwrap.fill(text, width=width,
                         initial_indent=initindent,
                         subsequent_indent=subindent)
248 245
249 246
def format(text, width, indent=0):
    """Parse and format the text according to width.

    The text is split into blocks, every block is shifted right by
    indent, the parsing passes are applied in order, and the formatted
    blocks are joined with newlines into the returned string.
    """
    parsed = findblocks(text)
    for entry in parsed:
        entry['indent'] += indent
    # Each pass takes the list of blocks and returns the updated list.
    passes = (findliteralblocks, inlineliterals, splitparagraphs,
              findsections, addmargins)
    for stage in passes:
        parsed = stage(parsed)
    return '\n'.join([formatblock(entry, width) for entry in parsed])
261 258
262 259
if __name__ == "__main__":
    # Debugging aid: run every parsing pass on the file named by the
    # first command line argument, dump the blocks after each pass and
    # finally print the text formatted to a width of 30 columns.
    from pprint import pprint

    def debug(func, blocks):
        """Apply func to blocks, print the result, and return it."""
        blocks = func(blocks)
        # print(...) with a single argument behaves the same whether
        # print is a statement or a function.
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print("")
        return blocks

    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        # Close the file explicitly instead of leaving the handle to
        # the garbage collector.
        fp.close()
    blocks = debug(findblocks, text)
    blocks = debug(findliteralblocks, blocks)
    blocks = debug(inlineliterals, blocks)
    blocks = debug(splitparagraphs, blocks)
    blocks = debug(findsections, blocks)
    blocks = debug(addmargins, blocks)
    print('\n'.join(formatblock(b, 30) for b in blocks))
General Comments 0
You need to be logged in to leave comments. Login now