##// END OF EJS Templates
minimal reStructuredText parser
Martin Geisler -
r9156:c9c7e8cd default
parent child Browse files
Show More
@@ -0,0 +1,299 b''
1 # minirst.py - minimal reStructuredText parser
2 #
3 # Copyright 2009 Matt Mackall <mpm@selenic.com> and others
4 #
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2, incorporated herein by reference.
7
8 """simplified reStructuredText parser.
9
10 This parser knows just enough about reStructuredText to parse the
11 Mercurial docstrings.
12
13 It cheats in a major way: nested blocks are not really nested. They
14 are just indented blocks that look like they are nested. This relies
15 on the user to keep the right indentation for the blocks.
16
17 It only supports a small subset of reStructuredText:
18
19 - paragraphs
20
21 - definition lists (must use ' ' to indent definitions)
22
23 - lists (items must start with '-')
24
25 - literal blocks
26
27 - option lists (supports only long options without arguments)
28
29 - inline markup is not recognized at all.
30 """
31
32 import re, sys, textwrap
33
34
def findblocks(text):
    """Split text into runs of consecutive non-blank lines.

    Blank (whitespace-only) lines act purely as separators and never
    appear in the result. The return value is a list of dictionaries,
    one per run, with two fields:

    - 'indent': the smallest number of leading whitespace characters
      found on any line of the run
    - 'lines': the lines of the run with that many leading characters
      removed
    """
    groups = []
    current = []
    for rawline in text.splitlines():
        if rawline.strip():
            current.append(rawline)
        elif current:
            # A blank line closes the run collected so far.
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    result = []
    for group in groups:
        # The least indented line decides the indentation of the block.
        margin = min([len(entry) - len(entry.lstrip()) for entry in group])
        result.append({'indent': margin,
                       'lines': [entry[margin:] for entry in group]})
    return result
55
56
def findliteralblocks(blocks):
    """Finds literal blocks and adds a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'.

    A paragraph whose last line ends with '::' introduces the more
    deeply indented blocks that follow it as literal blocks. The
    marker itself is rewritten: a paragraph consisting only of '::'
    is removed, a trailing ' ::' is dropped entirely, and a trailing
    '::' is reduced to a single ':'.

    The list is modified in place and also returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i+1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i+1]['indent'] - indent
            removed = False

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
                removed = True
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # This only applies when the introducing paragraph is
            # still present. After removing an expanded-form '::'
            # paragraph, blocks[i] is an unrelated earlier block (or,
            # when i == -1, the *last* block through negative
            # indexing), and consulting it would mis-adjust the
            # indentation.
            if not removed and blocks[i]['lines'][0].startswith('- '):
                indent += 2
                adjustment -= 2

            # Mark the following indented blocks.
            while i+1 < len(blocks) and blocks[i+1]['indent'] > indent:
                blocks[i+1]['type'] = 'literal'
                blocks[i+1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks
106
107
def findsections(blocks):
    """Mark section titles.

    A block of type 'paragraph' with exactly two lines, where the
    second line consists solely of '-' characters and is exactly as
    long as the first line, is retyped as 'section':

        Section title
        -------------

    The blocks must already carry a 'type' field, i.e., they should
    have been run through findliteralblocks first. The list is
    modified in place and also returned.
    """
    for candidate in blocks:
        if candidate['type'] != 'paragraph':
            continue
        content = candidate['lines']
        if len(content) != 2:
            continue
        title, underline = content
        if underline == '-' * len(title):
            candidate['type'] = 'section'
    return blocks
126
127
def findbulletlists(blocks):
    """Split bullet list paragraphs into one block per list item.

    A block of type 'paragraph' whose first line starts with '- ' is
    replaced by a sequence of blocks of type 'bullet'. Every line
    starting with '- ' opens a new item (with the marker stripped);
    any other line is appended to the item opened most recently. The
    items are indented two columns deeper than the paragraph to leave
    room for the marker.

    The blocks must already carry a 'type' field, i.e., they should
    have been run through findliteralblocks first. The list is
    modified in place and also returned.
    """
    rebuilt = []
    for block in blocks:
        if not (block['type'] == 'paragraph' and
                block['lines'][0].startswith('- ')):
            rebuilt.append(block)
            continue

        entries = []
        for text in block['lines']:
            if text.startswith('- '):
                entries.append({'type': 'bullet', 'lines': [],
                                'indent': block['indent'] + 2})
                text = text[2:]
            entries[-1]['lines'].append(text)
        rebuilt.extend(entries)

    blocks[:] = rebuilt
    return blocks
155
156
# An option list entry: a long option, an optional argument attached
# with ' ' or '=', then AT LEAST TWO spaces, then the description.
# The two-space separator is what distinguishes an option entry such
# as '--all  Output all.' from ordinary text like '--foo bar baz'.
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)?  +)(.*)$')
def findoptionlists(blocks):
    """Finds option lists.

    A block of type 'paragraph' whose first line looks like an option
    entry is replaced by one block of type 'option' per entry. Lines
    that do not start a new entry are appended to the preceding one.
    Each option block records a 'width' field: the length of the
    option, its argument and the separating spaces, i.e., the column
    at which the description starts.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first. The list is modified in
    place and also returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +----------------------------+-------------+
        # | "--" option "  "           | description |
        # +-------+--------------------+             |
        #         | (body elements)+                 |
        #         +----------------------------------+
        if (blocks[i]['type'] == 'paragraph' and
            _optionre.match(blocks[i]['lines'][0])):
            options = []
            for line in blocks[i]['lines']:
                m = _optionre.match(line)
                if m:
                    option, arg, rest = m.groups()
                    width = len(option) + len(arg)
                    options.append(dict(type='option', lines=[],
                                        indent=blocks[i]['indent'],
                                        width=width))
                # The first line always matches, so options is never
                # empty here.
                options[-1]['lines'].append(line)
            blocks[i:i+1] = options
            # Skip over the blocks that were just inserted.
            i += len(options) - 1
        i += 1
    return blocks
189
190
def finddefinitionlists(blocks):
    """Split definition list paragraphs into one block per term.

    A block of type 'paragraph' with more than one line, whose first
    line does not start with ' ' but whose second line does, is
    replaced by a sequence of blocks of type 'definition'. Every line
    not starting with ' ' is a term and opens a new block; the other
    lines are collected as its definition. The 'hang' field of each
    block holds the amount of leading whitespace of the last line
    added to it.

    The blocks must already carry a 'type' field, i.e., they should
    have been run through findliteralblocks first. The list is
    modified in place and also returned.
    """
    rebuilt = []
    for block in blocks:
        isdeflist = (block['type'] == 'paragraph' and
                     len(block['lines']) > 1 and
                     not block['lines'][0].startswith(' ') and
                     block['lines'][1].startswith(' '))
        if not isdeflist:
            rebuilt.append(block)
            continue

        entries = []
        for text in block['lines']:
            if not text.startswith(' '):
                entries.append({'type': 'definition', 'lines': [],
                                'indent': block['indent']})
            current = entries[-1]
            current['lines'].append(text)
            # Overwritten on every line, so the final line wins.
            current['hang'] = len(text) - len(text.lstrip())
        rebuilt.extend(entries)

    blocks[:] = rebuilt
    return blocks
222
223
def addmargins(blocks):
    """Insert 'margin' blocks that provide vertical spacing.

    Two neighbouring blocks are kept together when they have the same
    type and that type is 'bullet', 'option' or 'definition'. Every
    other pair of neighbours is separated by a new block of type
    'margin' with a single empty line and no indentation.

    The list is modified in place and also returned.
    """
    grouped = ('bullet', 'option', 'definition')
    spaced = []
    for index, block in enumerate(blocks):
        if index > 0:
            kind = block['type']
            keeptogether = (kind == blocks[index - 1]['type'] and
                            kind in grouped)
            if not keeptogether:
                spaced.append(dict(lines=[''], indent=0, type='margin'))
        spaced.append(block)
    blocks[:] = spaced
    return blocks
239
240
def formatblock(block, width):
    """Render a single block as text wrapped to width.

    - 'margin' blocks render as the empty string.
    - 'literal' and 'section' blocks are emitted line by line with the
      block indentation and are never re-wrapped.
    - 'definition' blocks put the term on a line of its own, followed
      by the definition wrapped at the hanging indentation.
    - all other blocks are joined into one paragraph and wrapped;
      'bullet' blocks start with a '- ' marker and 'option' blocks
      indent continuation lines to the description column.
    """
    pad = ' ' * block['indent']
    kind = block['type']

    if kind == 'margin':
        return ''

    if kind in ('literal', 'section'):
        # Preformatted: only the indentation is applied.
        return pad + ('\n' + pad).join(block['lines'])

    if kind == 'definition':
        hanging = pad + ' ' * block['hang']
        body = ' '.join([str.strip(piece) for piece in block['lines'][1:]])
        wrapped = textwrap.fill(body, width=width,
                                initial_indent=hanging,
                                subsequent_indent=hanging)
        return "%s\n%s" % (pad + block['lines'][0], wrapped)

    first = rest = pad
    body = ' '.join([str.strip(piece) for piece in block['lines']])
    if kind == 'bullet':
        # The marker occupies the last two columns of the indentation.
        first = pad[:-2] + '- '
    elif kind == 'option':
        rest = pad + ' ' * block['width']

    return textwrap.fill(body, width=width,
                         initial_indent=first,
                         subsequent_indent=rest)
267
268
def format(text, width):
    """Parse text as simplified reStructuredText and wrap it to width.

    The text is split into blocks, the blocks are classified by the
    find* passes in a fixed order, margins are added, and finally each
    block is rendered with formatblock. The rendered blocks are joined
    with newlines and returned as a single string.
    """
    passes = (findliteralblocks,
              findsections,
              findbulletlists,
              findoptionlists,
              finddefinitionlists,
              addmargins)
    blocks = findblocks(text)
    for transform in passes:
        blocks = transform(blocks)
    rendered = [formatblock(b, width) for b in blocks]
    return '\n'.join(rendered)
279
280
if __name__ == "__main__":
    # Debugging aid: run every parsing pass on the file named by the
    # first command line argument, dumping the block list after each
    # pass, and finally print the text formatted to 30 columns.
    from pprint import pprint

    def debug(func, blocks):
        """Apply func to blocks, dump the result, and return it."""
        blocks = func(blocks)
        # Single-argument parenthesised prints produce the same output
        # whether print is a statement or a function.
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print("")
        return blocks

    # Close the input file explicitly instead of leaving the handle
    # open until it happens to be garbage collected.
    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        fp.close()

    blocks = debug(findblocks, text)
    blocks = debug(findliteralblocks, blocks)
    blocks = debug(findsections, blocks)
    blocks = debug(findbulletlists, blocks)
    blocks = debug(findoptionlists, blocks)
    blocks = debug(finddefinitionlists, blocks)
    blocks = debug(addmargins, blocks)
    print('\n'.join([formatblock(b, 30) for b in blocks]))
@@ -0,0 +1,138 b''
1 #!/usr/bin/env python
2
3 from mercurial import minirst
4
def debugformat(title, text, width):
    """Print text formatted by minirst to width, framed by rules.

    The output is a header line naming the title and the width, a
    rule of 70 dashes, the formatted text, another rule, and a
    trailing blank line.
    """
    rule = "-" * 70
    print("%s formatted to fit within %d characters:" % (title, width))
    print(rule)
    print(minirst.format(text, width))
    print(rule)
    print("")
11
12 paragraphs = """
13 This is some text in the first paragraph.
14
15 An indented paragraph
16 with just two lines.
17
18
19 The third paragraph. It is followed by some
20 random lines with spurious spaces.
21
22
23
24
25
26 No indention
27 here, despite
28 the uneven left
29 margin.
30
31 Only the
32 left-most line
33 (this line!)
34 is significant
35 for the indentation
36
37 """
38
39 debugformat('paragraphs', paragraphs, 60)
40 debugformat('paragraphs', paragraphs, 30)
41
42
43 definitions = """
44 A Term
45 Definition. The indented
46 lines make up the definition.
47 Another Term
48 Another definition. The final line in the
49 definition determines the indentation, so
50 this will be indented with four spaces.
51
52 A Nested/Indented Term
53 Definition.
54 """
55
56 debugformat('definitions', definitions, 60)
57 debugformat('definitions', definitions, 30)
58
59
60 literals = r"""
61 The fully minimized form is the most
62 convenient form::
63
64 Hello
65 literal
66 world
67
68 In the partially minimized form a paragraph
69 simply ends with space-double-colon. ::
70
71 ////////////////////////////////////////
72 long un-wrapped line in a literal block
73 \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
74
75 ::
76
77 This literal block is started with '::',
78 the so-called expanded form. The paragraph
79 with '::' disappears in the final output.
80 """
81
82 debugformat('literals', literals, 60)
83 debugformat('literals', literals, 30)
84
85
86 lists = """
87 - This is the first list item.
88
89 Second paragraph in the first list item.
90
91 - List items need not be separated
92 by a blank line.
93 - And will be rendered without
94 one in any case.
95
96 We can have indented lists:
97
98 - This is an indented list item
99
100 - Another indented list item::
101
102 - A literal block in the middle
103 of an indented list.
104
105 (The above is not a list item since we are in the literal block.)
106
107 ::
108
109 Literal block with no indentation.
110 """
111
112 debugformat('lists', lists, 60)
113 debugformat('lists', lists, 30)
114
115
116 options = """
117 There is support for simple option lists,
118 but only with long options:
119
120 --all Output all.
121 --both Output both (this description is
122 quite long).
123 --long Output all day long.
124
125 --par This option has two paragraphs in its description.
126 This is the first.
127
128 This is the second. Blank lines may be omitted between
129 options (as above) or left in (as here).
130
131 The next paragraph looks like an option list, but lacks the two-space
132 marker after the option. It is treated as a normal paragraph:
133
134 --foo bar baz
135 """
136
137 debugformat('options', options, 60)
138 debugformat('options', options, 30)
@@ -0,0 +1,209 b''
1 paragraphs formatted to fit within 60 characters:
2 ----------------------------------------------------------------------
3 This is some text in the first paragraph.
4
5 An indented paragraph with just two lines.
6
7 The third paragraph. It is followed by some random lines
8 with spurious spaces.
9
10 No indention here, despite the uneven left margin.
11
12 Only the left-most line (this line!) is significant for
13 the indentation
14 ----------------------------------------------------------------------
15
16 paragraphs formatted to fit within 30 characters:
17 ----------------------------------------------------------------------
18 This is some text in the first
19 paragraph.
20
21 An indented paragraph with
22 just two lines.
23
24 The third paragraph. It is
25 followed by some random lines
26 with spurious spaces.
27
28 No indention here, despite the
29 uneven left margin.
30
31 Only the left-most line
32 (this line!) is significant
33 for the indentation
34 ----------------------------------------------------------------------
35
36 definitions formatted to fit within 60 characters:
37 ----------------------------------------------------------------------
38 A Term
39 Definition. The indented lines make up the definition.
40 Another Term
41 Another definition. The final line in the definition
42 determines the indentation, so this will be indented
43 with four spaces.
44 A Nested/Indented Term
45 Definition.
46 ----------------------------------------------------------------------
47
48 definitions formatted to fit within 30 characters:
49 ----------------------------------------------------------------------
50 A Term
51 Definition. The indented
52 lines make up the
53 definition.
54 Another Term
55 Another definition. The
56 final line in the
57 definition determines the
58 indentation, so this will
59 be indented with four
60 spaces.
61 A Nested/Indented Term
62 Definition.
63 ----------------------------------------------------------------------
64
65 literals formatted to fit within 60 characters:
66 ----------------------------------------------------------------------
67 The fully minimized form is the most convenient form:
68
69 Hello
70 literal
71 world
72
73 In the partially minimized form a paragraph simply ends with
74 space-double-colon.
75
76 ////////////////////////////////////////
77 long un-wrapped line in a literal block
78 \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
79
80 This literal block is started with '::',
81 the so-called expanded form. The paragraph
82 with '::' disappears in the final output.
83 ----------------------------------------------------------------------
84
85 literals formatted to fit within 30 characters:
86 ----------------------------------------------------------------------
87 The fully minimized form is
88 the most convenient form:
89
90 Hello
91 literal
92 world
93
94 In the partially minimized
95 form a paragraph simply ends
96 with space-double-colon.
97
98 ////////////////////////////////////////
99 long un-wrapped line in a literal block
100 \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
101
102 This literal block is started with '::',
103 the so-called expanded form. The paragraph
104 with '::' disappears in the final output.
105 ----------------------------------------------------------------------
106
107 lists formatted to fit within 60 characters:
108 ----------------------------------------------------------------------
109 - This is the first list item.
110
111 Second paragraph in the first list item.
112
113 - List items need not be separated by a blank line.
114 - And will be rendered without one in any case.
115
116 We can have indented lists:
117
118 - This is an indented list item
119 - Another indented list item:
120
121 - A literal block in the middle
122 of an indented list.
123
124 (The above is not a list item since we are in the literal block.)
125
126 Literal block with no indentation.
127 ----------------------------------------------------------------------
128
129 lists formatted to fit within 30 characters:
130 ----------------------------------------------------------------------
131 - This is the first list item.
132
133 Second paragraph in the
134 first list item.
135
136 - List items need not be
137 separated by a blank line.
138 - And will be rendered without
139 one in any case.
140
141 We can have indented lists:
142
143 - This is an indented list
144 item
145 - Another indented list
146 item:
147
148 - A literal block in the middle
149 of an indented list.
150
151 (The above is not a list item since we are in the literal block.)
152
153 Literal block with no indentation.
154 ----------------------------------------------------------------------
155
156 options formatted to fit within 60 characters:
157 ----------------------------------------------------------------------
158 There is support for simple option lists, but only with long
159 options:
160
161 --all Output all.
162 --both Output both (this description is quite long).
163 --long Output all day long.
164 --par This option has two paragraphs in its
165 description. This is the first.
166
167 This is the second. Blank lines may be omitted
168 between options (as above) or left in (as here).
169
170 The next paragraph looks like an option list, but lacks the
171 two-space marker after the option. It is treated as a normal
172 paragraph:
173
174 --foo bar baz
175 ----------------------------------------------------------------------
176
177 options formatted to fit within 30 characters:
178 ----------------------------------------------------------------------
179 There is support for simple
180 option lists, but only with
181 long options:
182
183 --all Output all.
184 --both Output both (this
185 description is
186 quite long).
187 --long Output all day
188 long.
189 --par This option has two
190 paragraphs in its
191 description. This
192 is the first.
193
194 This is the second.
195 Blank lines may be
196 omitted between
197 options (as above)
198 or left in (as
199 here).
200
201 The next paragraph looks like
202 an option list, but lacks the
203 two-space marker after the
204 option. It is treated as a
205 normal paragraph:
206
207 --foo bar baz
208 ----------------------------------------------------------------------
209
General Comments 0
You need to be logged in to leave comments. Login now