##// END OF EJS Templates
minirst: use unicode string as intermediate form for replacement...
FUJIWARA Katsunori -
r11464:521c8e0c stable
parent child Browse files
Show More
@@ -1,385 +1,392 b''
1 1 # minirst.py - minimal reStructuredText parser
2 2 #
3 3 # Copyright 2009, 2010 Matt Mackall <mpm@selenic.com> and others
4 4 #
5 5 # This software may be used and distributed according to the terms of the
6 6 # GNU General Public License version 2 or any later version.
7 7
8 8 """simplified reStructuredText parser.
9 9
10 10 This parser knows just enough about reStructuredText to parse the
11 11 Mercurial docstrings.
12 12
13 13 It cheats in a major way: nested blocks are not really nested. They
14 14 are just indented blocks that look like they are nested. This relies
15 15 on the user to keep the right indentation for the blocks.
16 16
17 17 It only supports a small subset of reStructuredText:
18 18
19 19 - sections
20 20
21 21 - paragraphs
22 22
23 23 - literal blocks
24 24
25 25 - definition lists
26 26
27 27 - bullet lists (items must start with '-')
28 28
29 29 - enumerated lists (no autonumbering)
30 30
31 31 - field lists (colons cannot be escaped)
32 32
33 33 - option lists (supports only long options without arguments)
34 34
35 35 - inline literals (no other inline markup is recognized)
36 36 """
37 37
38 38 import re, sys
39 import util
39 import util, encoding
40
def replace(text, substs):
    """Apply a sequence of (old, new) substitutions to a byte string.

    The text is decoded with the configured ``encoding.encoding``, the
    substitutions are applied in order on the decoded form, and the
    result is encoded back with the same encoding. Working on the
    decoded form means the replacements operate on characters rather
    than on raw bytes.
    """
    charset = encoding.encoding
    decoded = text.decode(charset)
    for old, new in substs:
        decoded = decoded.replace(old, new)
    return decoded.encode(charset)
40 46
def findblocks(text):
    """Split text into blocks of consecutive non-blank lines.

    Returns a list of dictionaries, one per block. Each dictionary has
    an 'indent' field (the smallest leading-whitespace width among the
    block's lines) and a 'lines' field (the lines with that common
    indentation removed).
    """
    groups = []
    current = []
    for line in text.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            # A blank line closes the block being collected; runs of
            # blank lines never produce empty blocks.
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    result = []
    for group in groups:
        margin = min(len(l) - len(l.lstrip()) for l in group)
        result.append(dict(indent=margin, lines=[l[margin:] for l in group]))
    return result
61 67
62 68
def findliteralblocks(blocks):
    """Finds literal blocks and adds a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'. The list is modified in place (marker
    blocks consisting only of '::' are removed) and returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block
                del blocks[i]
                i -= 1
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # When the removed '::' marker was the very first block, i
            # is now -1 and there is no introducing block to inspect;
            # indexing with -1 would silently look at the *last* block
            # instead, so skip the correction in that case.
            if i >= 0:
                m = _bulletre.match(blocks[i]['lines'][0])
                if m:
                    indent += m.end()
                    adjustment -= m.end()

            # Mark the following indented blocks.
            while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
                blocks[i + 1]['type'] = 'literal'
                blocks[i + 1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks

# List item start: '-', an alphanumeric label followed by '.', an
# alphanumeric label followed by ')' (optionally preceded by '('), or
# '|', each followed by a space.
_bulletre = re.compile(r'(-|[0-9A-Za-z]+\.|\(?[0-9A-Za-z]+\)|\|) ')
# Long option, an optional argument (separated by ' ' or '=') plus
# trailing spaces, and the description.
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)? +)(.*)$')
# Field: ':key:' followed by spaces and the field body.
_fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):[ ]+(.*)')
# Definition term: any line starting with a non-space character.
_definitionre = re.compile(r'[^ ]')
118 124
def splitparagraphs(blocks):
    """Split paragraph blocks that are really lists into one block per item.

    Each 'paragraph' block whose first line starts a list item is
    replaced, in place, by a sequence of blocks typed 'bullet',
    'option', 'field' or 'definition'. Returns the same list.
    """
    # Tuples with (list type, item regexp, single line items?). Order
    # matters: the definition list regexp is the least specific one
    # and must come last.
    listtypes = [('bullet', _bulletre, True),
                 ('option', _optionre, True),
                 ('field', _fieldre, True),
                 ('definition', _definitionre, False)]

    def match(lines, i, itemre, singleline):
        """Does itemre match an item at line i?

        A list item can be followed by an indented line or another list
        item (but only if singleline is True).
        """
        if not itemre.match(lines[i]):
            return False
        if i + 1 < len(lines):
            following = lines[i + 1]
        else:
            following = ''
        if not singleline:
            return following.startswith(' ')
        return (following == '' or following[0] == ' '
                or itemre.match(following))

    i = 0
    while i < len(blocks):
        block = blocks[i]
        if block['type'] == 'paragraph':
            lines = block['lines']
            for listtype, itemre, singleline in listtypes:
                if not match(lines, 0, itemre, singleline):
                    continue
                items = []
                for j, line in enumerate(lines):
                    if match(lines, j, itemre, singleline):
                        # A new item starts here; later lines are
                        # appended to it until the next item starts.
                        items.append(dict(type=listtype, lines=[],
                                          indent=block['indent']))
                    items[-1]['lines'].append(line)
                blocks[i:i + 1] = items
                break
        i += 1
    return blocks
160 166
161 167
# Column width used by formatblock when laying out field keys.
_fieldwidth = 12

def updatefieldlists(blocks):
    """Find key and maximum key width for field lists.

    For every run of consecutive 'field' blocks, the ':key:' prefix is
    stripped from the first line of each block and stored under 'key',
    and the length of the longest key in the run is stored under
    'keywidth' on every block of that run. Returns the same list.
    """
    pos = 0
    while pos < len(blocks):
        if blocks[pos]['type'] != 'field':
            pos += 1
            continue

        widest = 0
        end = pos
        while end < len(blocks) and blocks[end]['type'] == 'field':
            field = blocks[end]
            m = _fieldre.match(field['lines'][0])
            key, rest = m.groups()
            field['lines'][0] = rest
            field['key'] = key
            widest = max(widest, len(key))
            end += 1

        for field in blocks[pos:end]:
            field['keywidth'] = widest
        # blocks[end], if any, is known not to be a field: step over it.
        pos = end + 1

    return blocks
187 193
188 194
def prunecontainers(blocks, keep):
    """Prune unwanted containers.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    Every '.. container:: type' marker block is removed. The more
    deeply indented blocks that follow it are deleted when the type is
    not listed in keep; otherwise they are kept and their indentation
    is reduced so that they line up with the marker.

    Returns a (blocks, pruned) tuple, where blocks is the list that was
    passed in (modified in place) and pruned lists the container types
    that were removed.
    """
    pruned = []
    i = 0
    while i + 1 < len(blocks):
        # Searching for a block that looks like this:
        #
        # +-------+---------------------------+
        # | ".. container ::" type            |
        # +---+                               |
        #     | blocks                        |
        #     +-------------------------------+
        if not (blocks[i]['type'] == 'paragraph' and
                blocks[i]['lines'][0].startswith('.. container::')):
            i += 1
            continue

        indent = blocks[i]['indent']
        adjustment = blocks[i + 1]['indent'] - indent
        containertype = blocks[i]['lines'][0][15:]
        prune = containertype not in keep
        if prune:
            pruned.append(containertype)

        # Always delete "..container:: type" block
        del blocks[i]
        j = i
        while j < len(blocks) and blocks[j]['indent'] > indent:
            if prune:
                del blocks[j]
            else:
                blocks[j]['indent'] -= adjustment
                j += 1
        # Do not advance i here: whatever block now sits at index i
        # (the first kept inner block, or the block following the
        # pruned ones) has not been examined yet and may itself be a
        # container marker.
    return blocks, pruned
226 232
227 233
# A section underline: one punctuation character repeated at least
# twice and nothing else on the line.
_sectionre = re.compile(r"""^([-=`:.'"~^_*+#])\1+$""")

def findsections(blocks):
    """Mark two-line title/underline paragraphs as sections.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A matching block gets the type 'section', its underline character
    is stored under 'underline' and the underline line is removed.
    Returns the same list.
    """
    for block in blocks:
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | Section title                |
        # | -------------                |
        # +------------------------------+
        if block['type'] != 'paragraph':
            continue
        lines = block['lines']
        if len(lines) != 2:
            continue
        title, underline = lines
        if len(title) != len(underline):
            continue
        if not _sectionre.match(underline):
            continue
        block['underline'] = underline[0]
        block['type'] = 'section'
        del lines[1]
    return blocks
251 257
252 258
def inlineliterals(blocks):
    """Replace inline literal markers with double quotes.

    In every 'paragraph' and 'section' block, each pair of backquotes
    is turned into a single double-quote character. Other block types
    are left untouched. Returns the same list.
    """
    substs = [('``', '"')]
    for b in blocks:
        if b['type'] not in ('paragraph', 'section'):
            continue
        b['lines'] = [replace(line, substs) for line in b['lines']]
    return blocks
258 265
259 266
def hgrole(blocks):
    """Rewrite the :hg: role in paragraph and section blocks.

    Returns the same list with the affected lines replaced.
    """
    # Turn :hg:`command` into "hg command". This also works when there
    # is a line break in the command and relies on the fact that we
    # have no stray back-quotes in the input (run the blocks through
    # inlineliterals first).
    substs = [(':hg:`', '"hg '), ('`', '"')]
    for b in blocks:
        if b['type'] not in ('paragraph', 'section'):
            continue
        b['lines'] = [replace(line, substs) for line in b['lines']]
    return blocks
270 277
271 278
def addmargins(blocks):
    """Adds empty blocks for vertical spacing.

    Consecutive blocks of the same type are kept together with no
    vertical space between them when that type is 'bullet', 'option'
    or 'field'; an empty 'margin' block is inserted between all other
    neighbouring blocks. The list is modified in place and returned.
    """
    grouped = ('bullet', 'option', 'field')
    i = 1
    while i < len(blocks):
        kind = blocks[i]['type']
        if kind in grouped and kind == blocks[i - 1]['type']:
            i += 1
            continue
        blocks.insert(i, dict(lines=[''], indent=0, type='margin'))
        # Step over both the new margin and the block it precedes.
        i += 2
    return blocks
287 294
288 295
def formatblock(block, width):
    """Render a single block as text wrapped to width.

    A width of zero or less selects the default of 78 columns. 'margin'
    blocks render as the empty string; 'literal' and 'section' blocks
    are emitted without wrapping; every other block type is wrapped
    with util.wrap using a hanging indent suited to its type.

    Note that the first line of 'bullet' (line block form) and 'field'
    blocks is rewritten in place as part of formatting.
    """
    if width <= 0:
        width = 78
    indent = ' ' * block['indent']
    kind = block['type']
    lines = block['lines']

    if kind == 'margin':
        return ''
    if kind == 'literal':
        # NOTE(review): this extract collapses runs of whitespace, so
        # the width of the extra literal indent below should be
        # confirmed against the repository copy.
        indent += ' '
        return indent + ('\n' + indent).join(lines)
    if kind == 'section':
        underline = len(lines[0]) * block['underline']
        return "%s%s\n%s%s" % (indent, lines[0], indent, underline)
    if kind == 'definition':
        term = indent + lines[0]
        # The definition body keeps the indentation of its last line.
        lastline = lines[-1]
        hang = len(lastline) - len(lastline.lstrip())
        defindent = indent + hang * ' '
        text = ' '.join(map(str.strip, lines[1:]))
        return '%s\n%s' % (term, util.wrap(text, width=width,
                                           initindent=defindent,
                                           hangindent=defindent))

    subindent = indent
    if kind == 'bullet':
        if lines[0].startswith('| '):
            # Remove bullet for line blocks and add no extra
            # indentation.
            lines[0] = lines[0][2:]
        else:
            m = _bulletre.match(lines[0])
            subindent = indent + m.end() * ' '
    elif kind == 'field':
        keywidth = block['keywidth']
        key = block['key']

        subindent = indent + _fieldwidth * ' '
        if len(key) + 2 > _fieldwidth:
            # key too large, use full line width
            padded = key.ljust(width)
        elif keywidth + 2 < _fieldwidth:
            # all keys are small, add only two spaces
            padded = key.ljust(keywidth + 2)
            subindent = indent + (keywidth + 2) * ' '
        else:
            # mixed sizes, use fieldwidth for this one
            padded = key.ljust(_fieldwidth)
        lines[0] = padded + lines[0]
    elif kind == 'option':
        m = _optionre.match(lines[0])
        option, arg, rest = m.groups()
        subindent = indent + (len(option) + len(arg)) * ' '

    text = ' '.join(map(str.strip, lines))
    return util.wrap(text, width=width,
                     initindent=indent,
                     hangindent=subindent)
344 351
345 352
def format(text, width, indent=0, keep=None):
    """Parse and format the text according to width.

    indent is added to the indentation of every block found in text.
    keep lists the container types to retain; all other containers are
    pruned.

    Returns the formatted text when keep is None, otherwise a
    (text, pruned) tuple where pruned lists the container types that
    were removed.
    """
    blocks = findblocks(text)
    for b in blocks:
        b['indent'] += indent
    blocks = findliteralblocks(blocks)
    blocks, pruned = prunecontainers(blocks, keep or [])
    # The remaining stages each take and return the block list; the
    # order matters (e.g. hgrole expects inlineliterals to have run).
    for stage in (findsections, inlineliterals, hgrole,
                  splitparagraphs, updatefieldlists, addmargins):
        blocks = stage(blocks)
    text = '\n'.join(formatblock(b, width) for b in blocks)
    if keep is None:
        return text
    return text, pruned
364 371
365 372
if __name__ == "__main__":
    # Debugging aid: run the file named by the first argument through
    # the parsing stages, dumping the block list after each stage, and
    # finally print the text formatted to a width of 30 columns. Any
    # further arguments name the container types to keep.
    from pprint import pprint

    def debug(func, *args):
        """Call func(*args), pretty-print its result and return it."""
        blocks = func(*args)
        # Single-argument parenthesized form prints the same text
        # whether print is a statement or a function.
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print("")
        return blocks

    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        # Close the input explicitly instead of relying on garbage
        # collection.
        fp.close()

    # Run the stages in the same order as format() so that the debug
    # output reflects what format() actually produces: sections must be
    # detected before inlineliterals changes the title length, and
    # hgrole must not be left out.
    blocks = debug(findblocks, text)
    blocks = debug(findliteralblocks, blocks)
    blocks, pruned = debug(prunecontainers, blocks, sys.argv[2:])
    blocks = debug(findsections, blocks)
    blocks = debug(inlineliterals, blocks)
    blocks = debug(hgrole, blocks)
    blocks = debug(splitparagraphs, blocks)
    blocks = debug(updatefieldlists, blocks)
    blocks = debug(addmargins, blocks)
    print('\n'.join(formatblock(b, 30) for b in blocks))
General Comments 0
You need to be logged in to leave comments. Login now