##// END OF EJS Templates
fix(jupyter): adapted to support more formats. Fixes: RCCE-38
ilin.s -
r5273:5af2b517 default
parent child Browse files
Show More
@@ -1,581 +1,581 b''
1 1
2 2
3 3 # Copyright (C) 2011-2023 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urllib.parse
31 31 import pycmarkgfm
32 32
33 33 from mako.lookup import TemplateLookup
34 34 from mako.template import Template as MakoTemplate
35 35
36 36 from docutils.core import publish_parts
37 37 from docutils.parsers.rst import directives
38 38 from docutils import writers
39 39 from docutils.writers import html4css1
40 40 import markdown
41 41
42 42 from rhodecode.lib.utils2 import safe_str, MENTIONS_REGEX
43 43
44 44 log = logging.getLogger(__name__)
45 45
46 46 # default renderer used to generate automated comments
47 47 DEFAULT_COMMENTS_RENDERER = 'rst'
48 48
49 49 try:
50 50 from lxml.html import fromstring
51 51 from lxml.html import tostring
52 52 except ImportError:
53 53 log.exception('Failed to import lxml')
54 54 fromstring = None
55 55 tostring = None
56 56
57 57
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML translator used for sandboxing potential JS injections in
    reference links; it also emits literal blocks as
    ``<pre class="codehilite literal-block">``.
    """

    def visit_literal_block(self, node):
        self.body.append(
            self.starttag(node, 'pre', CLASS='codehilite literal-block'))

    def visit_reference(self, node):
        """
        Replace ``javascript:`` reference targets with a harmless
        placeholder, then delegate to the stock html4css1 translator.
        """
        if 'refuri' in node.attributes:
            scheme, colon, _rest = node['refuri'].partition(':')
            if colon:
                # Browsers ignore whitespace/control characters around and
                # inside the scheme (e.g. "java\tscript:"), so drop every
                # character <= 0x20 before comparing.
                scheme = ''.join(ch for ch in scheme.strip() if ch > ' ')
                if scheme.lower() == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        return html4css1.HTMLTranslator.visit_reference(self, node)
79 79
80 80
class RhodeCodeWriter(writers.html4css1.Writer):
    """html4css1 writer that renders nodes through CustomHTMLTranslator."""

    def __init__(self):
        super().__init__()
        # swap in the sandboxing translator defined in this module
        self.translator_class = CustomHTMLTranslator
85 85
86 86
def relative_links(html_source, server_paths):
    """
    Rewrite relative ``src``/``href`` attributes of an HTML fragment so they
    point at server paths.

    :param html_source: HTML markup; falsy values are returned unchanged.
    :param server_paths: mapping with ``'raw'`` and ``'standard'`` keys
        holding the request paths used as the base for media/raw links and
        regular links respectively.
    :return: the rewritten HTML as a string, or ``html_source`` untouched
        when lxml is unavailable or the markup cannot be parsed.
    """
    if not html_source:
        return html_source

    # both helpers are set to None when the lxml import at module level failed
    if fromstring is None or tostring is None:
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # best effort: markup lxml cannot parse is passed through as-is
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    # anchors carrying the "gfm" class are left alone
    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if href:
            mode = 'raw' if href.endswith('?raw=1') else 'standard'
            el.attrib['href'] = relative_path(href, server_paths[mode])

    return tostring(doc, encoding='unicode')
114 114
115 115
def relative_path(path, request_path, is_repo_file=None):
    """
    relative link support, path is a rel path, and request_path is current
    server path (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target found in the rendered document.
    :param request_path: server path of the document being rendered.
    :param is_repo_file: optional callable taking ``request_path`` and
        returning True when it points at a file; defaults to always True.
    :return: ``request_path`` when ``path`` is empty; ``path`` unchanged for
        data/javascript/mailto/tel links, anchors, absolute URLs or an empty
        ``request_path``; otherwise the resolved path with a leading ``/``.
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_str(urllib.quote(safe_str(final_path), '/:'))

    if not is_repo_file:
        def is_repo_file(_request_path):
            return True  # assume default is a valid file path

    if not path:
        return request_path

    path = safe_str(path)
    request_path = safe_str(request_path)

    # skip data, anchor, invalid links; mailto:/tel: carry no netloc so they
    # must be excluded here or they would be resolved like repository files
    if path.startswith(('data:', 'javascript:', 'mailto:', 'tel:', '#', ':')):
        return path

    if urllib.parse.urlparse(path).netloc:
        # absolute URL pointing at some host
        return path

    if not request_path:
        return path

    if path.startswith('/'):
        path = path[1:]

    if path.startswith('./'):
        path = path[2:]

    parts = request_path.split('/')

    # compute how deep we need to traverse the request_path; a request path
    # that is a VALID file contributes one level (its own file name)
    depth = 1 if is_repo_file(request_path) else 0

    while path.startswith('../'):
        depth += 1
        path = path[3:]

    if depth > 0:
        parts = parts[:-depth]

    parts.append(path)
    return '/' + '/'.join(parts).lstrip('/')
177 177
178 178
_cached_markdown_renderer = None
# (extensions, output_format) the cached renderer was built with
_cached_markdown_renderer_key = None


def get_markdown_renderer(extensions, output_format):
    """
    Return a module-wide cached ``markdown.Markdown`` instance.

    The instance is rebuilt when called with a configuration different from
    the cached one, and reset before being handed out again so state from a
    previously converted document does not leak into the next one.

    :param extensions: list of markdown extensions; ``'legacy_attrs'`` is
        always appended.
    :param output_format: output format passed to ``markdown.Markdown``.
    """
    global _cached_markdown_renderer, _cached_markdown_renderer_key

    cache_key = (tuple(extensions), output_format)
    if _cached_markdown_renderer is None or _cached_markdown_renderer_key != cache_key:
        _cached_markdown_renderer = markdown.Markdown(
            extensions=list(extensions) + ['legacy_attrs'],
            output_format=output_format)
        _cached_markdown_renderer_key = cache_key
    else:
        # a reused instance must be reset between documents
        _cached_markdown_renderer.reset()
    return _cached_markdown_renderer
190 190
191 191
def get_markdown_renderer_flavored(extensions, output_format):
    """
    Thin wrapper mimicking the markdown API (an object with ``convert``) that
    renders GitHub-flavored markdown: the source is first normalized to
    CommonMark by pycmarkgfm and then converted by the regular renderer.
    """
    base_renderer = get_markdown_renderer(extensions, output_format)

    class GFM(object):
        def convert(self, source):
            gfm_options = pycmarkgfm.options.hardbreaks
            with pycmarkgfm.parse_gfm(source, options=gfm_options) as gfm_document:
                commonmark_source = gfm_document.to_commonmark()
            return base_renderer.convert(commonmark_source)

    return GFM()
206 206
207 207
class MarkupRenderer(object):
    """
    Renders markdown, reStructuredText, Jupyter notebooks and plain text
    into HTML, picking the renderer from the file name.
    """
    # rst directives that get disabled before rendering
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    MENTION_PAT = re.compile(MENTIONS_REGEX)

    extensions = ['markdown.extensions.codehilite', 'markdown.extensions.extra',
                  'markdown.extensions.def_list', 'markdown.extensions.sane_lists']

    output_format = 'html4'

    # Extensions paired with weights; a lower weight comes first. The weights
    # control the order in which extensions are attached to readme names.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        Pick the renderer used for generating html from a markup language,
        based on the file name.

        :param source: markup source (not inspected here)
        :param filename: file name; a missing name falls back to plain
        :return: the matching renderer classmethod
        """
        # patterns cannot be matched against None (the default)
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # README without extension and anything unknown render as plain
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def sanitize_html(cls, text):
        """Sanitize `text` with the project's html filter (markdown mode)."""
        from .html_filters import sanitize_html
        return sanitize_html(text, markdown=True)

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name to inspect
        :param exclude: list/tuple of extensions to ignore; any other type
            disables filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([ext for ext, _weight in cls.MARKDOWN_EXTS if ext])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([ext for ext, _weight in cls.RST_EXTS if ext])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders a given source using the renderer detected from the file
        name. When nothing matches it falls back to the plain renderer,
        which replaces new lines with <br />.
        """
        renderer = self._detect_renderer(source, filename)
        return renderer(source)

    @classmethod
    def urlify_text(cls, text):
        """Wrap every http(s) URL matched by URL_PAT in an anchor tag."""
        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return f'<a href="{url_full}">{url_full}</a>'

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def convert_mentions(cls, text, mode):
        """
        Highlight user mentions matched by MENTION_PAT.

        :param text: markup source
        :param mode: 'markdown' (hovercard markup) or 'rst' (bold text)
        :raises ValueError: for any other mode, once a mention is found
        :return: converted text with surrounding whitespace stripped
        """
        mention_pat = cls.MENTION_PAT

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            hovercard_url = "pyroutes.url('hovercard_username', {'username': '%s'});" % uname

            if mode == 'markdown':
                tmpl = '<strong class="tooltip-hovercard" data-hovercard-alt="{uname}" data-hovercard-url="{hovercard_url}">@{uname}</strong>'
            elif mode == 'rst':
                tmpl = ' **@{uname}** '
            else:
                raise ValueError('mode must be rst or markdown')

            return tmpl.format(**{'uname': uname,
                                  'hovercard_url': hovercard_url})

        return mention_pat.sub(wrapp, text).strip()

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render plain text: URLs become links and new lines become <br />;
        the result is passed through sanitize_html.

        :param universal_newline: normalize all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a <br />
        """
        source = safe_str(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')

        return cls.sanitize_html(source)

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        Returns markdown rendered code, sanitized when `clean_html` is set.

        :param safe: on rendering errors fall back to plain mode instead of
            raising
        :param flavored: use the GitHub-flavored renderer
        :param mentions: highlight user mentions before rendering
        :param clean_html: run the output through sanitize_html
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='markdown')
            # mentions are extracted; render again with mentions=False and
            # keep the caller's remaining options, including clean_html
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        if flavored:
            markdown_renderer = get_markdown_renderer_flavored(
                cls.extensions, cls.output_format)
        else:
            markdown_renderer = get_markdown_renderer(
                cls.extensions, cls.output_format)

        try:
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.sanitize_html(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        Returns reStructuredText rendered as HTML (title + fragment).

        :param safe: on rendering errors fall back to plain mode instead of
            raising
        :param mentions: highlight user mentions before rendering
        :param clean_html: run the rendered fragment through sanitize_html
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='rst')
            # mentions are extracted; render again with mentions=False and
            # keep the caller's clean_html choice
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_str(source)
        try:
            # registering a directive as None disables it
            for directive_name in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(directive_name, None)

            docutils_settings = {
                'input_encoding': 'unicode',
                'report_level': 4,
                'syntax_highlight': 'short',
            }

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.sanitize_html(rendered)
            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a Jupyter notebook (JSON source) as HTML.

        :param source: notebook source
        :param safe: when set, markdown cells are sanitized and javascript
            outputs are replaced by a placeholder text
        :return: HTML body, or an "Invalid Notebook!" block describing the
            validation errors
        """
        from html import escape as html_escape

        from rhodecode.lib import helpers
        from .html_sanitizer_defs import markdown_attrs, all_tags, all_styles

        from traitlets import default, config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor
        from nbconvert.preprocessors.sanitize import SanitizeHTML

        class CustomHTMLExporter(HTMLExporter):

            @default("template_file")
            def _template_file_default(self):
                if self.template_extension:
                    return "basic/index" + self.template_extension

        class Sandbox(Preprocessor):

            def preprocess_cell(self, cell, resources, cell_index):
                if not safe:
                    return cell, resources
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                if cell.cell_type == "markdown":
                    cell.source = cls.sanitize_html(cell.source)
                    return cell, resources

                # not every cell type carries outputs
                for cell_output in cell.get('outputs', []):
                    if 'data' in cell_output:
                        if 'application/javascript' in cell_output['data']:
                            cell_output['data']['text/plain'] = sandbox_text
                            cell_output['data'].pop('application/javascript', None)
                return cell, resources

        def _sanitize_resources(input_resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't mess up UI so much
            """
            # NOTE(review): this helper is not called anywhere in this method

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS
            input_resources[0] = input_resources[0][input_resources[0].find('/*! Source'):]
            return input_resources

        def as_html(notebook):
            conf = config.Config()
            # TODO: Keep an eye on the order of preprocessors
            conf.CustomHTMLExporter.default_preprocessors = [Sandbox, SanitizeHTML]
            conf.Sandbox.enabled = True
            conf.SanitizeHTML.enabled = True
            conf.SanitizeHTML.attributes = markdown_attrs
            conf.SanitizeHTML.tags = all_tags
            conf.SanitizeHTML.styles = all_styles
            conf.SanitizeHTML.sanitized_output_types = {
                "text/html",
                "text/markdown",
            }
            conf.SanitizeHTML.safe_output_keys = {
                "metadata",
                "text/plain",
                "text/latex",
                "application/json",
                "image/png",
                "image/jpg",
                "image/jpeg",
                "image/svg",
                "image/svg+xml",
            }

            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)

            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- MathJax configuration -->
            <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                TeX: {
                    extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                },
                tex2jax: {
                    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                    processEscapes: true,
                    processEnvironments: true
                },
                // Center justify equations in code and markdown cells. Elsewhere
                // we use CSS to left justify single line equations in code cells.
                displayAlign: 'center',
                "HTML-CSS": {
                    styles: {'.MathJax_Display': {"margin": 0}},
                    linebreaks: { automatic: true },
                    availableFonts: ["STIX", "TeX"]
                },
                showMathMenu: false
            });
            </script>
            <!-- End of MathJax configuration -->
            <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = MakoTemplate(r'''
            <link rel="stylesheet" type="text/css" href="${h.asset('css/style-ipython.css', ver=ver)}" media="screen"/>
            ''').render(h=helpers, ver='ver1')

            body = '\n'.join([header, css, js, body])
            return body, resources

        captured_errors = {}
        error_body = """
        <div style="text-align: center;">
            <h3>Invalid Notebook!</h3>
            <p>{}</p>
        </div>
        """
        # TODO: In the event of a newer jupyter notebook version, consider increasing the as_version parameter
        notebook = nbformat.reads(source, as_version=4, capture_validation_error=captured_errors)
        if captured_errors:
            # error text is derived from the notebook, so escape it before
            # embedding it into the returned HTML
            error_messages = '<br>'.join(
                html_escape(str(error)) for error in captured_errors.values())
            body = error_body.format(error_messages)
        else:
            try:
                body, _ = as_html(notebook)
            except (AttributeError, nbformat.ValidationError):
                # report an invalid notebook as such; anything else is a
                # genuine error and is re-raised
                try:
                    nbformat.validate(nbformat.reader.reads(source))
                except nbformat.ValidationError as exc:
                    body = error_body.format(html_escape(str(exc)))
                else:
                    raise
        return body
564 564
565 565
class RstTemplateRenderer(object):
    """
    Renders Mako templates looked up in the ``templates/rst_templates``
    directory located one level above this module's directory.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        base = os.path.abspath(os.path.dirname(module_dir))
        rst_template_dir = os.path.join(base, 'templates', 'rst_templates')
        # templates get the helpers module available as `h`
        self.template_store = TemplateLookup(
            directories=[rst_template_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """Render the named template with `kwargs` as its context."""
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now