##// END OF EJS Templates
jupyter: sanitize markdown cells similarly to how we do it for our own markdown cleanup.
marcink -
r3148:fb1dc128 default
parent child Browse files
Show More
@@ -1,526 +1,534 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2018 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31 import bleach
32 32
33 33 from mako.lookup import TemplateLookup
34 34 from mako.template import Template as MakoTemplate
35 35
36 36 from docutils.core import publish_parts
37 37 from docutils.parsers.rst import directives
38 38 from docutils import writers
39 39 from docutils.writers import html4css1
40 40 import markdown
41 41
42 42 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
43 43 from rhodecode.lib.utils2 import (
44 44 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
45 45
# module level logger, named after this module
log = logging.getLogger(__name__)

# default renderer used to generate automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'
50 50
51 51
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    def visit_reference(self, node):
        """
        Neutralize ``javascript:`` reference targets before delegating to the
        stock html4css1 translator.

        :param node: docutils reference node, its ``refuri`` attribute is
            replaced in place when it uses the javascript scheme
        """
        if 'refuri' in node.attributes:
            refuri = node['refuri']
            if ':' in refuri:
                scheme = refuri.split(':', 1)[0]
                # Drop every whitespace character (not only the leading
                # ones) before comparing, so variants like `java\tscript:`
                # or `javascript :` are sandboxed as well.
                scheme = ''.join(scheme.split()).lower()

                if scheme == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # old style class requires this...
        return html4css1.HTMLTranslator.visit_reference(self, node)
71 71
72 72
class RhodeCodeWriter(html4css1.Writer):
    """
    html4css1 writer that emits HTML through :class:`CustomHTMLTranslator`
    """

    def __init__(self):
        # run the generic docutils Writer initializer, then swap in our
        # sandboxing translator
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
77 77
78 78
def relative_links(html_source, server_paths):
    """
    Rewrite relative links found in rendered HTML so they point at the
    server paths of the currently viewed file.

    ``src`` of ``img``/``video`` elements is resolved against
    ``server_paths['raw']``. ``href`` of ``a`` elements (except those having
    the ``gfm`` class) is resolved against ``server_paths['raw']`` when it
    ends with ``?raw=1`` and against ``server_paths['standard']`` otherwise.

    :param html_source: HTML to process, returned untouched when empty,
        when lxml cannot be imported or when the HTML cannot be parsed
    :param server_paths: mapping with ``raw`` and ``standard`` keys
    :return: serialized HTML produced by ``lxml.html.tostring``
    """
    if not html_source:
        return html_source

    try:
        from lxml.html import fromstring
        from lxml.html import tostring
    except ImportError:
        log.exception('Failed to import lxml')
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # best effort: keep the original markup, but leave a trace of why
        # links were not rewritten
        log.debug('Failed to parse HTML source, relative links not '
                  'processed', exc_info=True)
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if href:
            path_key = 'raw' if href.endswith('?raw=1') else 'standard'
            el.attrib['href'] = relative_path(href, server_paths[path_key])

    return tostring(doc)
110 110
111 111
def relative_path(path, request_path, is_repo_file=None):
    """
    relative link support, path is a rel path, and request_path is current
    server path (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target found in the rendered document
    :param request_path: server path of the document being rendered
    :param is_repo_file: optional callable receiving ``request_path``; when it
        returns True the last segment of ``request_path`` is treated as a
        file name and dropped. Defaults to a check that always returns True.
    :return: ``request_path`` for an empty ``path``; ``path`` unchanged for
        anchors, ``data:``/``javascript:`` links, any link carrying a URL
        scheme or network location, or when ``request_path`` is empty;
        otherwise the resolved path with a leading ``/``
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    def dummy_check(p):
        return True  # assume default is a valid file path

    is_repo_file = is_repo_file or dummy_check
    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    if path.startswith((u'data:', u'javascript:', u'#', u':')):
        # skip data, anchor, invalid links
        return path

    # A link is absolute when it has a network location (http://host/...,
    # //host/...) OR a scheme without one (mailto:, tel:, ...). Checking the
    # netloc alone turned e.g. `mailto:user@host` into a repository path.
    parsed = urlparse.urlparse(path)
    if parsed.scheme or parsed.netloc:
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    parts = request_path.split('/')
    # compute how deep we need to traverse the request_path
    depth = 0

    if is_repo_file(request_path):
        # if request path is a VALID file, we use a relative path with
        # one level up
        depth += 1

    while path.startswith(u'../'):
        depth += 1
        path = path[3:]

    if depth > 0:
        parts = parts[:-depth]

    parts.append(path)
    final_path = u'/'.join(parts).lstrip(u'/')

    return u'/' + final_path
173 173
174 174
class MarkupRenderer(object):
    """
    Renders markup sources into HTML. Supported renderers are exposed as
    classmethods: :meth:`rst`, :meth:`markdown`, :meth:`jupyter` and
    :meth:`plain`. :meth:`render` picks one of them based on the file name.
    """
    # rst directives that are replaced with ``None`` before rendering
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
    output_format = 'html4'
    markdown_renderer = markdown.Markdown(
        extensions, enable_attributes=False, output_format=output_format)

    markdown_renderer_flavored = markdown.Markdown(
        extensions + [GithubFlavoredMarkdownExtension()],
        enable_attributes=False, output_format=output_format)

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        Detection is based on the file name extension only; anything that
        is not recognized, including a missing file name, is rendered
        as plain text.

        :param source: markup source, not inspected by the detection
        :param filename: name of the rendered file, may be None
        :return: one of the renderer classmethods of this class
        """
        # `render()` defaults filename to None, which the regexes below
        # cannot handle; treat it like an unknown file name
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # extension-less readme files and every unknown file
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def bleach_clean(cls, text):
        """
        Sanitize HTML using bleach with the tags and attributes allowed by
        the ``bleach_whitelist`` module.

        :param text: HTML to sanitize
        :return: sanitized HTML, or the literal ``UNPARSEABLE TEXT`` when
            bleach fails to process the input
        """
        from .bleach_whitelist import markdown_attrs, markdown_tags
        allowed_tags = markdown_tags
        allowed_attrs = markdown_attrs

        try:
            return bleach.clean(text, tags=allowed_tags, attributes=allowed_attrs)
        except Exception:
            # keep the best-effort placeholder, but don't hide the reason
            log.exception('Failed to sanitize text using bleach')
            return 'UNPARSEABLE TEXT'

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name to check
        :param exclude: list/tuple of extensions to ignore, any other value
            disables filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter(
            [ext for ext, _weight in cls.MARKDOWN_EXTS if ext])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([ext for ext, _weight in cls.RST_EXTS if ext])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders given source using a renderer detected from the file name
        extension. When no renderer matches, the source is rendered as plain
        text, with new lines replaced by <br />

        :param source: markup source to render
        :param filename: name of the file the source comes from
        """

        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _highlight_mentions(cls, source):
        """
        Wrap every match of MENTIONS_REGEX into ``**@name**`` (bold in both
        rst and markdown), using the first regex group as the name.

        :param source: markup source
        :return: stripped source with highlighted mentions
        """
        mention_pat = re.compile(MENTIONS_REGEX)

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            return ' **@%(uname)s** ' % {'uname': uname}

        return mention_pat.sub(wrapp, source).strip()

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        Pre-processes the markdown source: ``<pre>`` blocks are protected
        from any modification, and underscores of a leading word having two
        or more of them are escaped.

        :param text: markdown source
        :return: pre-processed markdown source
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = pattern.sub(pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        # NOTE(review): without re.MULTILINE `^` anchors only at the very
        # start of the text, so only a leading word is escaped. Left as is,
        # matching every line would also touch fenced code blocks.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            return '\n\n' + extractions[matchobj.group(1)]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Turn every http/https URL matched by URL_PAT into an HTML link.

        :param text: text to process
        :return: text with URLs wrapped into ``<a>`` tags
        """
        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render source as plain text: URLs become links and new lines are
        replaced with ``<br />``. No other processing is applied here.

        :param source: text to render
        :param universal_newline: normalize all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a ``<br />``
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')
        return source

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        returns markdown rendered code cleaned by the bleach library

        :param source: markdown source
        :param safe: on rendering errors fall back to plain rendering
            instead of raising
        :param flavored: use the Github flavored renderer and pre-processing
        :param mentions: highlight user mentions before rendering
        :param clean_html: sanitize the result with :meth:`bleach_clean`
        """

        if flavored:
            markdown_renderer = cls.markdown_renderer_flavored
        else:
            markdown_renderer = cls.markdown_renderer

        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false,
            # all other options (including clean_html) are passed through
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        source = safe_unicode(source)

        try:
            if flavored:
                source = cls._flavored_markdown(source)
            # the Markdown instance is shared by all calls; reset it so
            # state collected by extensions for a previous document does not
            # leak into this one
            markdown_renderer.reset()
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.bleach_clean(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        returns reStructuredText rendered as an HTML fragment, prefixed with
        the html title generated by docutils

        :param source: rst source
        :param safe: on rendering errors fall back to plain rendering
            instead of raising
        :param mentions: highlight user mentions before rendering
        :param clean_html: sanitize the rendered fragment (not the title)
            with :meth:`bleach_clean`
        """
        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false,
            # all other options (including clean_html) are passed through
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_unicode(source)
        try:
            docutils_settings = dict(
                [(alias, None) for alias in
                 cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])

            docutils_settings.update({
                'input_encoding': 'unicode', 'report_level': 4})

            # disable only the disallowed directives; the remaining keys of
            # docutils_settings are settings, not directive names
            for directive_name in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(directive_name, None)

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.bleach_clean(rendered)
            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        returns a Jupyter notebook rendered as HTML, together with the
        inlined notebook CSS and the MathJax loader

        :param source: notebook source (JSON), read as nbformat version 4
        :param safe: replace javascript outputs with a placeholder and
            sanitize markdown cells before rendering
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                if not safe:
                    # sandboxing disabled, leave the notebook untouched
                    return nb, resources

                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if 'outputs' in cell:
                        for cell_output in cell['outputs']:
                            if 'data' in cell_output:
                                if 'application/javascript' in cell_output['data']:
                                    cell_output['data']['text/plain'] = sandbox_text
                                    cell_output['data'].pop('application/javascript', None)

                    if 'source' in cell and cell['cell_type'] == 'markdown':
                        # sanitize similar like in markdown
                        cell['source'] = cls.bleach_clean(cell['source'])

                return nb, resources

        def _sanitize_resources(resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            if not resources:
                return resources

            # strip the reset CSS, i.e. everything before the marker. When
            # the marker is missing keep the CSS intact: slicing with the
            # -1 returned by find() would leave only the last character.
            marker_pos = resources[0].find('/*! Source')
            if marker_pos != -1:
                resources[0] = resources[0][marker_pos:]
            return resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
<!-- Load mathjax -->
<!-- MathJax configuration -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
TeX: {
extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
},
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
processEscapes: true,
processEnvironments: true
},
// Center justify equations in code and markdown cells. Elsewhere
// we use CSS to left justify single line equations in code cells.
displayAlign: 'center',
"HTML-CSS": {
styles: {'.MathJax_Display': {"margin": 0}},
linebreaks: { automatic: true },
availableFonts: ["STIX", "TeX"]
},
showMathMenu: false
});
</script>
<!-- End of mathjax configuration -->
<script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
509 517
510 518
class RstTemplateRenderer(object):
    """
    Renders Mako templates stored in the ``templates/rst_templates``
    directory located one level above the directory of this module.
    """

    def __init__(self):
        parent_dir = os.path.dirname(os.path.dirname(__file__))
        templates_dir = os.path.join(
            os.path.abspath(parent_dir), 'templates', 'rst_templates')
        # `h` helpers are made available inside of every template
        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Look up a template by name in the template store."""
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render the named template, passing kwargs as template variables.

        :param template_name: name of the template to look up
        """
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now