##// END OF EJS Templates
markup-renderer: use safe fetching of attributes to prevent errors on malformed HTML.
marcink -
r1840:05beb7b6 default
parent child Browse files
Show More
@@ -1,488 +1,488 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31
32 32 from mako.lookup import TemplateLookup
33 33 from mako.template import Template as MakoTemplate
34 34
35 35 from docutils.core import publish_parts
36 36 from docutils.parsers.rst import directives
37 37 from docutils import writers
38 38 from docutils.writers import html4css1
39 39 import markdown
40 40
41 41 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
42 42 from rhodecode.lib.utils2 import (
43 43 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
44 44
# module-level logger, named after this module
log = logging.getLogger(__name__)

# name of the renderer used when generating automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'
49 49
50 50
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    def visit_reference(self, node):
        """
        Neutralize ``javascript:`` reference targets before the stock
        html4css1 translator renders the node.

        :param node: docutils reference node; its ``refuri`` attribute is
            replaced in place when it uses the javascript scheme.
        """
        if 'refuri' in node.attributes:
            refuri = node['refuri']
            if ':' in refuri:
                prefix = refuri.split(':', 1)[0]
                # Browsers match the scheme case-insensitively, and drop
                # leading whitespace/control characters as well as embedded
                # tabs and newlines, so normalize before comparing;
                # otherwise e.g. "JavaScript:" or "java\tscript:" would
                # slip through the check.
                scheme = re.sub(r'[\x00-\x20]+', '', prefix).lower()
                if scheme == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # old style class requires this...
        return html4css1.HTMLTranslator.visit_reference(self, node)
68 68
69 69
class RhodeCodeWriter(writers.html4css1.Writer):
    """
    html4css1 based writer that renders documents through
    CustomHTMLTranslator instead of the stock translator.
    """

    def __init__(self):
        # initialize via the generic docutils Writer base, then plug in
        # the sandboxing translator
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
74 74
75 75
def relative_links(html_source, server_path):
    """
    Rewrite relative media sources and link targets found in an HTML
    fragment so they resolve against `server_path`.

    ``src`` of ``img``/``video`` elements and ``href`` of ``a`` elements
    (except those with the ``gfm`` class) are passed through
    `relative_path`. Elements without the attribute, or with an empty one,
    are left untouched.

    :param html_source: HTML markup to process; returned as-is when empty
        or when it cannot be parsed.
    :param server_path: current request path used as the base for links.
    :returns: serialized HTML as produced by ``lxml.html.tostring``, or the
        unmodified `html_source` on empty/unparsable input.
    """
    if not html_source:
        return html_source

    # A plain ``import lxml`` does not load the ``lxml.html`` submodule.
    # Import it explicitly here, otherwise the attribute lookup below can
    # fail and be silently swallowed by the broad ``except``, turning this
    # function into a no-op.
    import lxml.html

    try:
        doc = lxml.html.fromstring(html_source)
    except Exception:
        return html_source

    # (css selector, attribute holding the link)
    rewrite_rules = (
        ('img, video', 'src'),
        ('a:not(.gfm)', 'href'),
    )
    for selector, attr_name in rewrite_rules:
        for el in doc.cssselect(selector):
            # use .get(), malformed html can lack the attribute entirely
            link = el.attrib.get(attr_name)
            if link:
                el.attrib[attr_name] = relative_path(link, server_path)

    return lxml.html.tostring(doc)
96 96
97 97
def relative_path(path, request_path, is_repo_file=None):
    """
    relative link support, path is a rel path, and request_path is current
    server path (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target found in the rendered document.
    :param request_path: server path of the document being rendered.
    :param is_repo_file: optional callable taking `request_path` and
        returning True when it points to a file; defaults to a check that
        always returns True.
    :returns: `request_path` when `path` is empty; `path` unchanged for
        data/javascript/mailto/tel/anchor/invalid links, for links with a
        network location, and when `request_path` is empty; otherwise a
        unicode path starting with ``/``.
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    def dummy_check(p):
        return True  # assume default is a valid file path

    is_repo_file = is_repo_file or dummy_check
    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    # Links that must never be resolved against the request path.
    # mailto:/tel: have a scheme but no network location, so the netloc
    # test below does not recognise them; without listing them here they
    # would be rewritten into bogus repository paths.
    passthrough_prefixes = (
        u'data:', u'javascript:', u'mailto:', u'tel:', u'#', u':')
    if path.startswith(passthrough_prefixes):
        # skip data, mail/phone, anchor, invalid links
        return path

    # anything with a network location (http://host/..., //host/...) is
    # already absolute
    if urlparse.urlparse(path).netloc:
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    parts = request_path.split('/')
    # compute how deep we need to traverse the request_path
    depth = 0

    if is_repo_file(request_path):
        # if request path is a VALID file, we use a relative path with
        # one level up
        depth += 1

    # every leading '../' climbs one more level
    while path.startswith(u'../'):
        depth += 1
        path = path[3:]

    if depth > 0:
        parts = parts[:-depth]

    parts.append(path)
    final_path = u'/'.join(parts).lstrip(u'/')

    return u'/' + final_path
159 159
160 160
class MarkupRenderer(object):
    """
    Renders markup sources (Markdown, reStructuredText, Jupyter notebooks
    or plain text) into HTML.
    """
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
    markdown_renderer = markdown.Markdown(
        extensions, safe_mode=True, enable_attributes=False)

    markdown_renderer_flavored = markdown.Markdown(
        extensions + [GithubFlavoredMarkdownExtension()], safe_mode=True,
        enable_attributes=False)

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        Detection is purely based on the file name extension; anything
        that is not recognised (including a missing filename) is rendered
        as plain text.

        NOTE(review): an earlier docstring said `filename` could also be an
        explicit renderer name; the matching below does not implement that.

        :param source: unused by the detection
        :param filename: name of the file being rendered, may be None
        :returns: the renderer classmethod of MarkupRenderer to call
        """
        # filename defaults to None, regex matching needs a string
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # covers both extension-less readme files and unknown names
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name to check the extension of
        :param exclude: list/tuple of extensions to ignore; any other type
            disables the filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([x[0] for x in cls.MARKDOWN_EXTS if x[0]])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([x[0] for x in cls.RST_EXTS if x[0]])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders `source` using the renderer detected from `filename`.
        Unrecognised or missing file names are rendered as plain text.

        :param source: markup content to render
        :param filename: name of the file the content comes from
        """

        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _highlight_mentions(cls, source):
        """
        Wraps every match of MENTIONS_REGEX in `source` into bold
        ``**@name**`` markup and strips the result.
        """
        mention_pat = re.compile(MENTIONS_REGEX)

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            return ' **@%(uname)s** ' % {'uname': uname}

        return mention_pat.sub(wrapp, source).strip()

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        :param text: markdown source to pre-process
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        # NOTE(review): no re.MULTILINE here, so `^` only anchors at the very
        # start of the text. Left as is on purpose, changing it would also
        # escape underscores inside fenced code blocks.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            return '\n\n' + extractions[matchobj.group(1)]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Wraps every http/https url found in `text` into an ``<a>`` tag.
        The matched url is inserted as-is, no escaping is done here.
        """
        url_pat = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                             r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return url_pat.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True):
        """
        Plain text rendering: urls become links and new lines become
        ``<br />`` tags. No HTML escaping is performed here.

        :param source: text to render
        :param universal_newline: normalize all line endings to ``\\n`` first
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        source = cls.urlify_text(source)
        return '<br />' + source.replace("\n", '<br />')

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False):
        """
        Renders markdown `source` into HTML.

        :param source: markdown content
        :param safe: on rendering errors fall back to plain rendering
            instead of re-raising
        :param flavored: use the github flavored renderer and pre-processing
        :param mentions: highlight @mentions before rendering
        """
        # It does not allow to insert inline HTML. In presence of HTML tags, it
        # will replace them instead with [HTML_REMOVED]. This is controlled by
        # the safe_mode=True parameter of the markdown method.

        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False)

        if flavored:
            markdown_renderer = cls.markdown_renderer_flavored
        else:
            markdown_renderer = cls.markdown_renderer

        source = safe_unicode(source)
        try:
            if flavored:
                source = cls._flavored_markdown(source)
            return markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def rst(cls, source, safe=True, mentions=False):
        """
        Renders reStructuredText `source` into HTML (title + fragment).

        :param source: rst content
        :param safe: on rendering errors fall back to plain rendering
            instead of re-raising
        :param mentions: highlight @mentions before rendering
        """
        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.rst(mention_hl, safe=safe, mentions=False)

        source = safe_unicode(source)
        try:
            disallowed = cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES

            # Disable only the disallowed directives. The publisher
            # settings below are not directives and must not be registered
            # as such.
            for directive_name in disallowed:
                directives.register_directive(directive_name, None)

            docutils_settings = dict(
                [(alias, None) for alias in disallowed])

            docutils_settings.update({'input_encoding': 'unicode',
                                      'report_level': 4})

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)

            return parts['html_title'] + parts["fragment"]
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Renders a Jupyter notebook (`source` is the notebook file content)
        into HTML, prefixed with a marker comment, the exporter CSS and the
        MathJax loader.

        :param source: notebook content, read with nbformat as version 4
        :param safe: replace ``application/javascript`` cell outputs with
            a sandbox text
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if not (safe and 'outputs' in cell):
                        continue
                    for cell_output in cell['outputs']:
                        if 'data' not in cell_output:
                            continue
                        output_data = cell_output['data']
                        if 'application/javascript' in output_data:
                            # swap the javascript payload for a text note
                            output_data['text/plain'] = sandbox_text
                            output_data.pop('application/javascript', None)
                return nb, resources

        def _sanitize_resources(resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS, i.e. everything before the marker. When
            # the marker (or the CSS itself) is missing leave the input
            # untouched; slicing from find() == -1 would keep only the last
            # character.
            if resources:
                marker_pos = resources[0].find('/*! Source')
                if marker_pos != -1:
                    resources[0] = resources[0][marker_pos:]
            return resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- Load mathjax -->
            <!-- MathJax configuration -->
            <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
            jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
            extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
            TeX: {
            extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
            },
            tex2jax: {
            inlineMath: [ ['$','$'], ["\\(","\\)"] ],
            displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
            processEscapes: true,
            processEnvironments: true
            },
            // Center justify equations in code and markdown cells. Elsewhere
            // we use CSS to left justify single line equations in code cells.
            displayAlign: 'center',
            "HTML-CSS": {
            styles: {'.MathJax_Display': {"margin": 0}},
            linebreaks: { automatic: true },
            availableFonts: ["STIX", "TeX"]
            },
            showMathMenu: false
            });
            </script>
            <!-- End of mathjax configuration -->
            <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
471 471
472 472
class RstTemplateRenderer(object):
    """
    Renders mako templates stored in the ``templates/rst_templates``
    directory located one level above this module's directory.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        package_root = os.path.abspath(os.path.dirname(module_dir))
        templates_dir = os.path.join(
            package_root, 'templates', 'rst_templates')

        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Look up `templatename` in the template store."""
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render the template called `template_name`, passing `kwargs` on to
        the template's render call.
        """
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now