markup-renderer: fix some cases which could cause lxml errors, skip js flags
marcink
r1529:3683d343 default
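The gist of the change, shown as a standalone sketch rather than as part of the diff: relative_links() now returns empty or unparseable HTML unchanged instead of letting lxml raise, and relative_path() now treats javascript: URLs like data: and anchor links and leaves them alone. The snippet below only mirrors those two guards; the function bodies are adapted from the file for readability, the helper names and the runnable form are assumptions, and it needs nothing beyond an installed lxml.

# Standalone sketch of the two guards this commit adds; not part of the diff.
import lxml.html


def relative_links_guard(html_source):
    # Empty input used to reach lxml.html.fromstring() and make lxml raise.
    if not html_source:
        return html_source
    try:
        doc = lxml.html.fromstring(html_source)
    except Exception:
        # Unparseable markup is now returned untouched instead of erroring.
        return html_source
    return lxml.html.tostring(doc)


def is_skipped_link(path):
    # javascript: now joins data:, '#' and ':' as prefixes left untouched.
    return path.startswith((u'data:', u'javascript:', u'#', u':'))


print(repr(relative_links_guard('')))            # '' - lxml is never called
print(is_skipped_link(u'javascript:alert(1)'))   # True - link is left as-is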
@@ -1,455 +1,461 @@
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with the ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 import urllib
32 31
33 32 from mako.lookup import TemplateLookup
34 33 from mako.template import Template as MakoTemplate
35 34
36 35 from docutils.core import publish_parts
37 36 from docutils.parsers.rst import directives
38 37 import markdown
39 38
40 39 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
41 40 from rhodecode.lib.utils2 import (
42 41 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
43 42
44 43 log = logging.getLogger(__name__)
45 44
46 45 # default renderer used to generate automated comments
47 46 DEFAULT_COMMENTS_RENDERER = 'rst'
48 47
49 48
50 49 def relative_links(html_source, server_path):
51 doc = lxml.html.fromstring(html_source)
50 if not html_source:
51 return html_source
52
53 try:
54 doc = lxml.html.fromstring(html_source)
55 except Exception:
56 return html_source
57
52 58 for el in doc.cssselect('img, video'):
53 59 src = el.attrib['src']
54 60 if src:
55 61 el.attrib['src'] = relative_path(src, server_path)
56 62
57 63 for el in doc.cssselect('a:not(.gfm)'):
58 64 src = el.attrib['href']
59 65 if src:
60 66 el.attrib['href'] = relative_path(src, server_path)
61 67
62 68 return lxml.html.tostring(doc)
63 69
64 70
65 71 def relative_path(path, request_path, is_repo_file=None):
66 72 """
67 73 relative link support; path is a relative path, and request_path is the current
68 74 server path (not absolute)
69 75
70 76 e.g.
71 77
72 78 path = '../logo.png'
73 79 request_path= '/repo/files/path/file.md'
74 80 produces: '/repo/files/logo.png'
75 81 """
76 82 # TODO(marcink): unicode/str support ?
77 83 # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))
78 84
79 85 def dummy_check(p):
80 86 return True # assume default is a valid file path
81 87
82 88 is_repo_file = is_repo_file or dummy_check
83 89 if not path:
84 90 return request_path
85 91
86 92 path = safe_unicode(path)
87 93 request_path = safe_unicode(request_path)
88 94
89 if path.startswith((u'data:', u'#', u':')):
95 if path.startswith((u'data:', u'javascript:', u'#', u':')):
90 96 # skip data, javascript, anchor, invalid links
91 97 return path
92 98
93 99 is_absolute = bool(urlparse.urlparse(path).netloc)
94 100 if is_absolute:
95 101 return path
96 102
97 103 if not request_path:
98 104 return path
99 105
100 106 if path.startswith(u'/'):
101 107 path = path[1:]
102 108
103 109 if path.startswith(u'./'):
104 110 path = path[2:]
105 111
106 112 parts = request_path.split('/')
107 113 # compute how deep we need to traverse the request_path
108 114 depth = 0
109 115
110 116 if is_repo_file(request_path):
111 117 # if request path is a VALID file, we use a relative path with
112 118 # one level up
113 119 depth += 1
114 120
115 121 while path.startswith(u'../'):
116 122 depth += 1
117 123 path = path[3:]
118 124
119 125 if depth > 0:
120 126 parts = parts[:-depth]
121 127
122 128 parts.append(path)
123 129 final_path = u'/'.join(parts).lstrip(u'/')
124 130
125 131 return u'/' + final_path
126 132
127 133
128 134 class MarkupRenderer(object):
129 135 RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']
130 136
131 137 MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
132 138 RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
133 139 JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
134 140 PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)
135 141
136 142 extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
137 143 markdown_renderer = markdown.Markdown(
138 144 extensions, safe_mode=True, enable_attributes=False)
139 145
140 146 markdown_renderer_flavored = markdown.Markdown(
141 147 extensions + [GithubFlavoredMarkdownExtension()], safe_mode=True,
142 148 enable_attributes=False)
143 149
144 150 # extensions together with weights. Lower comes first, meaning we control how
145 151 # extensions are attached to readme names with these weights.
146 152 PLAIN_EXTS = [
147 153 # prefer no extension
148 154 ('', 0), # special case that renders READMES names without extension
149 155 ('.text', 2), ('.TEXT', 2),
150 156 ('.txt', 3), ('.TXT', 3)
151 157 ]
152 158
153 159 RST_EXTS = [
154 160 ('.rst', 1), ('.rest', 1),
155 161 ('.RST', 2), ('.REST', 2)
156 162 ]
157 163
158 164 MARKDOWN_EXTS = [
159 165 ('.md', 1), ('.MD', 1),
160 166 ('.mkdn', 2), ('.MKDN', 2),
161 167 ('.mdown', 3), ('.MDOWN', 3),
162 168 ('.markdown', 4), ('.MARKDOWN', 4)
163 169 ]
164 170
165 171 def _detect_renderer(self, source, filename=None):
166 172 """
167 173 runs detection of what renderer should be used for generating html
168 174 from a markup language
169 175
170 176 filename can also explicitly be a renderer name
171 177
172 178 :param source:
173 179 :param filename:
174 180 """
175 181
176 182 if MarkupRenderer.MARKDOWN_PAT.findall(filename):
177 183 detected_renderer = 'markdown'
178 184 elif MarkupRenderer.RST_PAT.findall(filename):
179 185 detected_renderer = 'rst'
180 186 elif MarkupRenderer.JUPYTER_PAT.findall(filename):
181 187 detected_renderer = 'jupyter'
182 188 elif MarkupRenderer.PLAIN_PAT.findall(filename):
183 189 detected_renderer = 'plain'
184 190 else:
185 191 detected_renderer = 'plain'
186 192
187 193 return getattr(MarkupRenderer, detected_renderer)
188 194
189 195 @classmethod
190 196 def renderer_from_filename(cls, filename, exclude):
191 197 """
192 198 Detect renderer markdown/rst from filename and optionally use exclude
193 199 list to remove some options. This is mostly used in helpers.
194 200 Returns None when no renderer can be detected.
195 201 """
196 202 def _filter(elements):
197 203 if isinstance(exclude, (list, tuple)):
198 204 return [x for x in elements if x not in exclude]
199 205 return elements
200 206
201 207 if filename.endswith(
202 208 tuple(_filter([x[0] for x in cls.MARKDOWN_EXTS if x[0]]))):
203 209 return 'markdown'
204 210 if filename.endswith(tuple(_filter([x[0] for x in cls.RST_EXTS if x[0]]))):
205 211 return 'rst'
206 212
207 213 return None
208 214
209 215 def render(self, source, filename=None):
210 216 """
211 217 Renders the given source using a renderer detected from the filename.
212 218 It detects renderers based on file extension or mimetype.
213 219 As a last resort it will just produce simple html, replacing new lines with <br/>
214 220
215 221 :param filename:
216 222 :param source:
217 223 """
218 224
219 225 renderer = self._detect_renderer(source, filename)
220 226 readme_data = renderer(source)
221 227 return readme_data
222 228
223 229 @classmethod
224 230 def _flavored_markdown(cls, text):
225 231 """
226 232 GitHub-style flavored markdown
227 233
228 234 :param text:
229 235 """
230 236
231 237 # Extract pre blocks.
232 238 extractions = {}
233 239
234 240 def pre_extraction_callback(matchobj):
235 241 digest = md5_safe(matchobj.group(0))
236 242 extractions[digest] = matchobj.group(0)
237 243 return "{gfm-extraction-%s}" % digest
238 244 pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
239 245 text = re.sub(pattern, pre_extraction_callback, text)
240 246
241 247 # Prevent foo_bar_baz from ending up with an italic word in the middle.
242 248 def italic_callback(matchobj):
243 249 s = matchobj.group(0)
244 250 if list(s).count('_') >= 2:
245 251 return s.replace('_', r'\_')
246 252 return s
247 253 text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)
248 254
249 255 # Insert pre block extractions.
250 256 def pre_insert_callback(matchobj):
251 257 return '\n\n' + extractions[matchobj.group(1)]
252 258 text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
253 259 pre_insert_callback, text)
254 260
255 261 return text
256 262
257 263 @classmethod
258 264 def urlify_text(cls, text):
259 265 url_pat = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
260 266 r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
261 267
262 268 def url_func(match_obj):
263 269 url_full = match_obj.groups()[0]
264 270 return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})
265 271
266 272 return url_pat.sub(url_func, text)
267 273
268 274 @classmethod
269 275 def plain(cls, source, universal_newline=True):
270 276 source = safe_unicode(source)
271 277 if universal_newline:
272 278 newline = '\n'
273 279 source = newline.join(source.splitlines())
274 280
275 281 source = cls.urlify_text(source)
276 282 return '<br />' + source.replace("\n", '<br />')
277 283
278 284 @classmethod
279 285 def markdown(cls, source, safe=True, flavored=True, mentions=False):
280 286 # It does not allow inserting inline HTML. In the presence of HTML tags, it
281 287 # will instead replace them with [HTML_REMOVED]. This is controlled by
282 288 # the safe_mode=True parameter of the markdown renderer.
283 289
284 290 if flavored:
285 291 markdown_renderer = cls.markdown_renderer_flavored
286 292 else:
287 293 markdown_renderer = cls.markdown_renderer
288 294
289 295 if mentions:
290 296 mention_pat = re.compile(MENTIONS_REGEX)
291 297
292 298 def wrapp(match_obj):
293 299 uname = match_obj.groups()[0]
294 300 return ' **@%(uname)s** ' % {'uname': uname}
295 301 mention_hl = mention_pat.sub(wrapp, source).strip()
296 302 # we extracted mentions; render with this using mentions=False
297 303 return cls.markdown(mention_hl, safe=safe, flavored=flavored,
298 304 mentions=False)
299 305
300 306 source = safe_unicode(source)
301 307 try:
302 308 if flavored:
303 309 source = cls._flavored_markdown(source)
304 310 return markdown_renderer.convert(source)
305 311 except Exception:
306 312 log.exception('Error when rendering Markdown')
307 313 if safe:
308 314 log.debug('Fallback to render in plain mode')
309 315 return cls.plain(source)
310 316 else:
311 317 raise
312 318
313 319 @classmethod
314 320 def rst(cls, source, safe=True, mentions=False):
315 321 if mentions:
316 322 mention_pat = re.compile(MENTIONS_REGEX)
317 323
318 324 def wrapp(match_obj):
319 325 uname = match_obj.groups()[0]
320 326 return ' **@%(uname)s** ' % {'uname': uname}
321 327 mention_hl = mention_pat.sub(wrapp, source).strip()
322 328 # we extracted mentions; render with this using mentions=False
323 329 return cls.rst(mention_hl, safe=safe, mentions=False)
324 330
325 331 source = safe_unicode(source)
326 332 try:
327 333 docutils_settings = dict(
328 334 [(alias, None) for alias in
329 335 cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])
330 336
331 337 docutils_settings.update({'input_encoding': 'unicode',
332 338 'report_level': 4})
333 339
334 340 for k, v in docutils_settings.iteritems():
335 341 directives.register_directive(k, v)
336 342
337 343 parts = publish_parts(source=source,
338 344 writer_name="html4css1",
339 345 settings_overrides=docutils_settings)
340 346
341 347 return parts['html_title'] + parts["fragment"]
342 348 except Exception:
343 349 log.exception('Error when rendering RST')
344 350 if safe:
345 351 log.debug('Fallback to render in plain mode')
346 352 return cls.plain(source)
347 353 else:
348 354 raise
349 355
350 356 @classmethod
351 357 def jupyter(cls, source, safe=True):
352 358 from rhodecode.lib import helpers
353 359
354 360 from traitlets.config import Config
355 361 import nbformat
356 362 from nbconvert import HTMLExporter
357 363 from nbconvert.preprocessors import Preprocessor
358 364
359 365 class CustomHTMLExporter(HTMLExporter):
360 366 def _template_file_default(self):
361 367 return 'basic'
362 368
363 369 class Sandbox(Preprocessor):
364 370
365 371 def preprocess(self, nb, resources):
366 372 sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
367 373 for cell in nb['cells']:
368 374 if safe and 'outputs' in cell:
369 375 for cell_output in cell['outputs']:
370 376 if 'data' in cell_output:
371 377 if 'application/javascript' in cell_output['data']:
372 378 cell_output['data']['text/plain'] = sandbox_text
373 379 cell_output['data'].pop('application/javascript', None)
374 380 return nb, resources
375 381
376 382 def _sanitize_resources(resources):
377 383 """
378 384 Skip/sanitize some of the CSS generated and included by jupyter
379 385 so it doesn't mess up the UI so much
380 386 """
381 387
382 388 # TODO(marcink): probably we should replace this with a whole custom
383 389 # CSS set that doesn't screw things up, but jupyter-generated html has some
384 390 # special markers, so achieving that requires a custom HTML exporter
385 391 # template with _default_template_path_default
386 392
387 393 # strip the reset CSS
388 394 resources[0] = resources[0][resources[0].find('/*! Source'):]
389 395 return resources
390 396
391 397 def as_html(notebook):
392 398 conf = Config()
393 399 conf.CustomHTMLExporter.preprocessors = [Sandbox]
394 400 html_exporter = CustomHTMLExporter(config=conf)
395 401
396 402 (body, resources) = html_exporter.from_notebook_node(notebook)
397 403 header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
398 404 js = MakoTemplate(r'''
399 405 <!-- Load mathjax -->
400 406 <!-- MathJax configuration -->
401 407 <script type="text/x-mathjax-config">
402 408 MathJax.Hub.Config({
403 409 jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
404 410 extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
405 411 TeX: {
406 412 extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
407 413 },
408 414 tex2jax: {
409 415 inlineMath: [ ['$','$'], ["\\(","\\)"] ],
410 416 displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
411 417 processEscapes: true,
412 418 processEnvironments: true
413 419 },
414 420 // Center justify equations in code and markdown cells. Elsewhere
415 421 // we use CSS to left justify single line equations in code cells.
416 422 displayAlign: 'center',
417 423 "HTML-CSS": {
418 424 styles: {'.MathJax_Display': {"margin": 0}},
419 425 linebreaks: { automatic: true },
420 426 availableFonts: ["STIX", "TeX"]
421 427 },
422 428 showMathMenu: false
423 429 });
424 430 </script>
425 431 <!-- End of mathjax configuration -->
426 432 <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
427 433 ''').render(h=helpers)
428 434
429 435 css = '<style>{}</style>'.format(
430 436 ''.join(_sanitize_resources(resources['inlining']['css'])))
431 437
432 438 body = '\n'.join([header, css, js, body])
433 439 return body, resources
434 440
435 441 notebook = nbformat.reads(source, as_version=4)
436 442 (body, resources) = as_html(notebook)
437 443 return body
438 444
439 445
440 446 class RstTemplateRenderer(object):
441 447
442 448 def __init__(self):
443 449 base = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
444 450 rst_template_dirs = [os.path.join(base, 'templates', 'rst_templates')]
445 451 self.template_store = TemplateLookup(
446 452 directories=rst_template_dirs,
447 453 input_encoding='utf-8',
448 454 imports=['from rhodecode.lib import helpers as h'])
449 455
450 456 def _get_template(self, templatename):
451 457 return self.template_store.get_template(templatename)
452 458
453 459 def render(self, template_name, **kwargs):
454 460 template = self._get_template(template_name)
455 461 return template.render(**kwargs)
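For orientation, here is a rough end-to-end usage sketch of the module touched above. It is not part of the commit: the import path and sample inputs are assumptions, and it presumes an environment where the module's dependencies (markdown, docutils, lxml, cssselect) are importable.

# Hypothetical usage; the import path is an assumption, not taken from the diff.
from rhodecode.lib.markup_renderer import MarkupRenderer, relative_links

renderer = MarkupRenderer()

# 'README.md' matches MARKDOWN_PAT, so the markdown classmethod is selected.
html = renderer.render('# Hello\n[docs](./docs/index.md)', filename='README.md')

# Rebase relative links onto the current server path. With this commit,
# empty or unparseable html comes back unchanged instead of raising an
# lxml error, and javascript: hrefs are left untouched by relative_path().
html = relative_links(html, server_path='/repo/files/branch/default/README.md')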