##// END OF EJS Templates
minirst: don't test regexps twice...
Martin Geisler -
r10064:6f30c357 default
parent child Browse files
Show More
@@ -1,280 +1,277 b''
1 # minirst.py - minimal reStructuredText parser
1 # minirst.py - minimal reStructuredText parser
2 #
2 #
3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
4 #
4 #
5 # This software may be used and distributed according to the terms of the
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2, incorporated herein by reference.
6 # GNU General Public License version 2, incorporated herein by reference.
7
7
8 """simplified reStructuredText parser.
8 """simplified reStructuredText parser.
9
9
10 This parser knows just enough about reStructuredText to parse the
10 This parser knows just enough about reStructuredText to parse the
11 Mercurial docstrings.
11 Mercurial docstrings.
12
12
13 It cheats in a major way: nested blocks are not really nested. They
13 It cheats in a major way: nested blocks are not really nested. They
14 are just indented blocks that look like they are nested. This relies
14 are just indented blocks that look like they are nested. This relies
15 on the user to keep the right indentation for the blocks.
15 on the user to keep the right indentation for the blocks.
16
16
17 It only supports a small subset of reStructuredText:
17 It only supports a small subset of reStructuredText:
18
18
19 - sections
19 - sections
20
20
21 - paragraphs
21 - paragraphs
22
22
23 - literal blocks
23 - literal blocks
24
24
25 - definition lists
25 - definition lists
26
26
27 - bullet lists (items must start with '-')
27 - bullet lists (items must start with '-')
28
28
29 - enumerated lists (no autonumbering)
29 - enumerated lists (no autonumbering)
30
30
31 - field lists (colons cannot be escaped)
31 - field lists (colons cannot be escaped)
32
32
33 - option lists (supports only long options without arguments)
33 - option lists (supports only long options without arguments)
34
34
35 - inline literals (no other inline markup is recognized)
35 - inline literals (no other inline markup is recognized)
36 """
36 """
37
37
38 import re, sys, textwrap
38 import re, sys, textwrap
39
39
40
40
def findblocks(text):
    """Find continuous blocks of lines in text.

    A block is a run of non-blank lines; blocks are separated by one or
    more blank (empty or whitespace-only) lines.

    Returns a list of dictionaries representing the blocks. Each block
    has an 'indent' field (the smallest amount of leading whitespace
    found on any of its lines) and a 'lines' field (the lines of the
    block with that common indentation removed).
    """
    groups = [[]]
    for line in text.splitlines():
        if line.strip():
            groups[-1].append(line)
        elif groups[-1]:
            # A blank line closes the current block. Consecutive blank
            # lines must not open additional empty blocks.
            groups.append([])
    if not groups[-1]:
        # Empty input or trailing blank lines leave an empty group.
        del groups[-1]

    blocks = []
    for group in groups:
        indent = min([len(l) - len(l.lstrip()) for l in group])
        blocks.append(dict(indent=indent,
                           lines=[l[indent:] for l in group]))
    return blocks
61
61
62
62
def findliteralblocks(blocks):
    """Finds literal blocks and adds a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'.

    The list is modified in place (a block consisting of only '::' is
    removed, the '::' markers are stripped from paragraphs and the
    indentation of literal blocks is adjusted) and is also returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # When the very first block was a lone '::' it has just been
            # deleted and i is -1: there is no introducing paragraph, and
            # blocks[-1] would wrongly inspect the *last* block instead.
            if i >= 0:
                m = _bulletre.match(blocks[i]['lines'][0])
                if m:
                    indent += m.end()
                    adjustment -= m.end()

            # Mark the following indented blocks.
            while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
                blocks[i + 1]['type'] = 'literal'
                blocks[i + 1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks

# Bullet or enumerated list item: "- ", "1. ", "a) " or "(a) ".
_bulletre = re.compile(r'(-|[0-9A-Za-z]+\.|\(?[0-9A-Za-z]+\)) ')
# Option list item: long option, optional argument, then the description.
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
# Field list item: ":key: value".
_fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):( +)(.*)')
# Definition list term: any line that does not start with a space.
_definitionre = re.compile(r'[^ ]')
118
118
def splitparagraphs(blocks):
    """Split paragraphs into lists.

    Every block of type 'paragraph' whose first line starts a list item
    is replaced, in place, by one block per item. The new blocks get
    the type 'bullet', 'option', 'field' or 'definition' and inherit
    the indentation of the paragraph they came from. The (modified)
    list of blocks is returned.
    """
    # Tuples with (list type, item regexp, single line items?). Order
    # matters: definition lists has the least specific regexp and must
    # come last.
    listtypes = [('bullet', _bulletre, True),
                 ('option', _optionre, True),
                 ('field', _fieldre, True),
                 ('definition', _definitionre, False)]

    def match(lines, i, itemre, singleline):
        """Does itemre match an item at line i?

        A list item can be followed by an indented line or another list
        item (but only if singleline is True).
        """
        if not itemre.match(lines[i]):
            return False
        if i + 1 < len(lines):
            nextline = lines[i + 1]
        else:
            nextline = ''
        if singleline:
            return bool(nextline == '' or nextline[0] == ' ' or
                        itemre.match(nextline))
        return nextline.startswith(' ')

    i = 0
    while i < len(blocks):
        if blocks[i]['type'] == 'paragraph':
            lines = blocks[i]['lines']
            for listtype, itemre, singleline in listtypes:
                if match(lines, 0, itemre, singleline):
                    items = []
                    for j, line in enumerate(lines):
                        # Line 0 always matches here, so items is never
                        # empty when a continuation line is appended.
                        if match(lines, j, itemre, singleline):
                            items.append(dict(type=listtype, lines=[],
                                              indent=blocks[i]['indent']))
                        items[-1]['lines'].append(line)
                    blocks[i:i + 1] = items
                    break
        i += 1
    return blocks
160
160
161
161
def findsections(blocks):
    """Finds sections.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A two-line paragraph whose second line consists of exactly as many
    '-' characters as the first line is long gets its type changed to
    'section'. The blocks are updated in place and the list is
    returned.
    """
    for block in blocks:
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | Section title                |
        # | -------------                |
        # +------------------------------+
        if block['type'] != 'paragraph':
            continue
        lines = block['lines']
        if len(lines) == 2 and lines[1] == '-' * len(lines[0]):
            block['type'] = 'section'
    return blocks
180
180
181
181
def inlineliterals(blocks):
    """Replace the inline literal markup in paragraphs.

    Every occurrence of two backquotes in the lines of a 'paragraph'
    block is replaced by a double quote character. Blocks of other
    types are left untouched. The list of blocks is returned.
    """
    for b in blocks:
        if b['type'] == 'paragraph':
            b['lines'] = [l.replace('``', '"') for l in b['lines']]
    return blocks
187
187
188
188
def addmargins(blocks):
    """Adds empty blocks for vertical spacing.

    This groups bullets, options, fields, and definitions together with
    no vertical space between consecutive items of the same type, and
    adds an empty block of type 'margin' between all other blocks. The
    list is modified in place and returned.
    """
    listtypes = ('bullet', 'option', 'field', 'definition')
    i = 1
    while i < len(blocks):
        current = blocks[i]['type']
        if current == blocks[i - 1]['type'] and current in listtypes:
            i += 1
        else:
            blocks.insert(i, dict(lines=[''], indent=0, type='margin'))
            # Skip over the margin and the block that follows it.
            i += 2
    return blocks
204
204
205
205
def formatblock(block, width):
    """Format a block according to width.

    block is a dictionary with 'type', 'indent' and 'lines' fields; a
    width of zero or less selects the default width of 78 columns. The
    formatted text is returned as a string and block itself is left
    unmodified, so it can safely be formatted more than once.

    Blocks of type 'bullet', 'field' and 'option' are expected to come
    from splitparagraphs, which guarantees that their first line
    matches the corresponding regular expression.
    """
    if width <= 0:
        width = 78
    indent = ' ' * block['indent']
    lines = list(block['lines'])
    blocktype = block['type']

    if blocktype == 'margin':
        return ''
    if blocktype == 'literal':
        indent += ' '
        return indent + ('\n' + indent).join(lines)
    if blocktype == 'section':
        return indent + ('\n' + indent).join(lines)
    if blocktype == 'definition':
        term = indent + lines[0]
        # The definition keeps the indentation of its last line.
        hang = len(lines[-1]) - len(lines[-1].lstrip())
        defindent = indent + hang * ' '
        text = ' '.join([l.strip() for l in lines[1:]])
        return "%s\n%s" % (term, textwrap.fill(text, width=width,
                                               initial_indent=defindent,
                                               subsequent_indent=defindent))

    initindent = subindent = indent
    if blocktype == 'bullet':
        m = _bulletre.match(lines[0])
        # Continuation lines line up with the text after the bullet.
        subindent = indent + m.end() * ' '
    elif blocktype == 'field':
        m = _fieldre.match(lines[0])
        key, spaces, rest = m.groups()
        # Turn ":foo: bar" into "foo   bar": the two colons are replaced
        # by two spaces so the value stays in its column, which is the
        # column subindent is computed for. Only the local copy of the
        # lines is rewritten, the block is not touched.
        lines[0] = '%s  %s%s' % (key, spaces, rest)
        subindent = indent + (2 + len(key) + len(spaces)) * ' '
    elif blocktype == 'option':
        m = _optionre.match(lines[0])
        option, arg, rest = m.groups()
        subindent = indent + (len(option) + len(arg)) * ' '

    text = ' '.join([l.strip() for l in lines])
    return textwrap.fill(text, width=width,
                         initial_indent=initindent,
                         subsequent_indent=subindent)
248
245
249
246
def format(text, width, indent=0):
    """Parse and format the text according to width.

    text is split into blocks which are classified by the parsing
    passes below and then rendered by formatblock. indent is added to
    the indentation of every block found in text. The formatted blocks
    are joined with newlines and returned as a single string.
    """
    blocks = findblocks(text)
    for b in blocks:
        b['indent'] += indent
    # The order of the passes matters: findliteralblocks assigns the
    # 'type' field that every later pass inspects.
    for parsepass in (findliteralblocks, inlineliterals, splitparagraphs,
                      findsections, addmargins):
        blocks = parsepass(blocks)
    return '\n'.join([formatblock(b, width) for b in blocks])
261
258
262
259
if __name__ == "__main__":
    # Debugging aid: run every parsing pass on the file named by the
    # first command line argument, dump the block list after each pass
    # and finally print the text formatted to a width of 30 columns.
    from pprint import pprint

    def debug(func, blocks):
        """Apply func to blocks, print the result and return it."""
        blocks = func(blocks)
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print("")
        return blocks

    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        # Do not rely on garbage collection to close the file.
        fp.close()
    blocks = debug(findblocks, text)
    blocks = debug(findliteralblocks, blocks)
    blocks = debug(inlineliterals, blocks)
    blocks = debug(splitparagraphs, blocks)
    blocks = debug(findsections, blocks)
    blocks = debug(addmargins, blocks)
    print('\n'.join([formatblock(b, 30) for b in blocks]))
General Comments 0
You need to be logged in to leave comments. Login now