##// END OF EJS Templates
security: sanitize plaintext renderer with bleach.
marcink -
r3485:80e2c96a default
parent child Browse files
Show More
@@ -1,557 +1,559 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2019 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31 import bleach
32 32
33 33 from mako.lookup import TemplateLookup
34 34 from mako.template import Template as MakoTemplate
35 35
36 36 from docutils.core import publish_parts
37 37 from docutils.parsers.rst import directives
38 38 from docutils import writers
39 39 from docutils.writers import html4css1
40 40 import markdown
41 41
42 42 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
43 43 from rhodecode.lib.utils2 import (safe_unicode, md5_safe, MENTIONS_REGEX)
44 44
45 45 log = logging.getLogger(__name__)
46 46
47 47 # default renderer used to generate automated comments
48 48 DEFAULT_COMMENTS_RENDERER = 'rst'
49 49
50 50
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    HTML translator that neutralises ``javascript:`` targets in reference
    links before delegating to the stock html4css1 translator.
    """

    def visit_reference(self, node):
        """
        Replace a ``javascript:`` refuri with a harmless sandbox marker, then
        let the base translator emit the reference as usual.

        :param node: docutils reference node being visited
        """
        if 'refuri' in node.attributes and ':' in node['refuri']:
            # everything before the first colon is the URI scheme
            scheme, _sep, _rest = node['refuri'].lstrip().partition(':')

            if scheme.lower() == 'javascript':
                # we don't allow javascript type of refs...
                node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # old style class requires this (no super() available)
        return html4css1.HTMLTranslator.visit_reference(self, node)
70 70
71 71
class RhodeCodeWriter(html4css1.Writer):
    """
    html4css1 writer that renders through :class:`CustomHTMLTranslator`.
    """

    def __init__(self):
        # initialise the generic docutils writer directly, then plug in our
        # sandboxing translator
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
76 76
77 77
def relative_links(html_source, server_paths):
    """
    Rewrite relative links found in rendered HTML so they point at the server.

    ``src`` attributes of ``img``/``video`` elements are resolved against
    ``server_paths['raw']``. ``href`` attributes of anchors without the
    ``gfm`` class are resolved against ``server_paths['raw']`` when they end
    with ``?raw=1`` and against ``server_paths['standard']`` otherwise.

    :param html_source: rendered HTML; returned untouched when empty
    :param server_paths: dict with ``raw`` and ``standard`` request paths
    :return: serialized HTML with rewritten links, or the unchanged
        ``html_source`` when lxml is unavailable or parsing fails
    """
    if not html_source:
        return html_source

    try:
        from lxml.html import fromstring
        from lxml.html import tostring
    except ImportError:
        log.exception('Failed to import lxml')
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # best effort: unparseable markup is returned as-is, but leave a
        # trace so the failure is not completely silent
        log.debug('Failed to parse HTML for relative links rewrite',
                  exc_info=True)
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if not href:
            continue

        # links explicitly asking for raw content resolve against the raw path
        if href.endswith('?raw=1'):
            base_path = server_paths['raw']
        else:
            base_path = server_paths['standard']
        el.attrib['href'] = relative_path(href, base_path)

    return tostring(doc)
109 109
110 110
def relative_path(path, request_path, is_repo_file=None):
    """
    relative link support, path is a rel path, and request_path is current
    server path (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target as found in the rendered document
    :param request_path: server path of the document being rendered
    :param is_repo_file: optional callable deciding whether ``request_path``
        points to a file; defaults to a check that always returns True
    :return: ``request_path`` when ``path`` is empty; ``path`` unchanged for
        data/javascript/mailto/anchor links, links with a network location,
        or when ``request_path`` is empty; otherwise a rooted server path
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    def dummy_check(p):
        return True  # assume default is a valid file path

    is_repo_file = is_repo_file or dummy_check
    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    # ``mailto:`` has no netloc, so without listing it here it would be
    # mistaken for a repository-relative path and rewritten
    if path.startswith(
            (u'data:', u'javascript:', u'mailto:', u'#', u':')):
        # skip data, mail, anchor, invalid links
        return path

    is_absolute = bool(urlparse.urlparse(path).netloc)
    if is_absolute:
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    parts = request_path.split('/')
    # compute how deep we need to traverse the request_path
    depth = 0

    if is_repo_file(request_path):
        # if request path is a VALID file, we use a relative path with
        # one level up
        depth += 1

    # every leading '../' climbs one more level
    while path.startswith(u'../'):
        depth += 1
        path = path[3:]

    if depth > 0:
        parts = parts[:-depth]

    parts.append(path)
    final_path = u'/'.join(parts).lstrip(u'/')

    return u'/' + final_path
172 172
173 173
_cached_markdown_renderer = None


def get_markdown_renderer(extensions, output_format):
    """
    Return the module-wide plain Markdown renderer, creating it on first use.

    The instance is built once from the arguments of the first call; later
    calls return that same instance regardless of the arguments passed.
    The renderer is reset before being handed out so per-document state kept
    by the parser and its extensions does not carry over from a previous
    conversion.

    :param extensions: list of markdown extensions used to build the renderer
    :param output_format: markdown output format, e.g. ``'html4'``
    """
    global _cached_markdown_renderer

    if _cached_markdown_renderer is None:
        _cached_markdown_renderer = markdown.Markdown(
            extensions=extensions,
            enable_attributes=False, output_format=output_format)

    # NOTE(review): the instance is shared module-wide; concurrent use from
    # several threads is not made safe by this reset - confirm deployment model
    _cached_markdown_renderer.reset()
    return _cached_markdown_renderer
185 185
186 186
_cached_markdown_renderer_flavored = None


def get_markdown_renderer_flavored(extensions, output_format):
    """
    Return the module-wide Github-flavored Markdown renderer, creating it on
    first use.

    The instance is built once from the arguments of the first call (plus
    ``GithubFlavoredMarkdownExtension``); later calls return that same
    instance regardless of the arguments passed. The renderer is reset before
    being handed out so per-document state kept by the parser and its
    extensions does not carry over from a previous conversion.

    :param extensions: list of markdown extensions used to build the renderer
    :param output_format: markdown output format, e.g. ``'html4'``
    """
    global _cached_markdown_renderer_flavored

    if _cached_markdown_renderer_flavored is None:
        _cached_markdown_renderer_flavored = markdown.Markdown(
            extensions=extensions + [GithubFlavoredMarkdownExtension()],
            enable_attributes=False, output_format=output_format)

    # NOTE(review): the instance is shared module-wide; concurrent use from
    # several threads is not made safe by this reset - confirm deployment model
    _cached_markdown_renderer_flavored.reset()
    return _cached_markdown_renderer_flavored
198 198
199 199
class MarkupRenderer(object):
    """
    Renders source text to HTML using a plain, markdown, rst or jupyter
    renderer, selected explicitly or detected from a file name.
    """
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    extensions = ['markdown.extensions.codehilite', 'markdown.extensions.extra',
                  'markdown.extensions.def_list', 'markdown.extensions.sane_lists']

    output_format = 'html4'

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        filename can be also explicitly a renderer name

        :param source: source text (not inspected by the detection)
        :param filename: file name to match; a missing name selects ``plain``
        :return: the matching renderer classmethod of ``MarkupRenderer``
        """
        # the default ``None`` cannot be fed to the regex matchers
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.findall(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.findall(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.findall(filename):
            detected_renderer = 'jupyter'
        else:
            # README without extension and anything unknown render as plain
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def bleach_clean(cls, text):
        """
        Sanitize ``text`` with bleach using the markdown tag/attribute
        whitelist.

        :param text: HTML to sanitize
        :return: sanitized HTML, or the literal ``'UNPARSEABLE TEXT'`` when
            bleach fails on the input
        """
        from .bleach_whitelist import markdown_attrs, markdown_tags
        allowed_tags = markdown_tags
        allowed_attrs = markdown_attrs

        try:
            return bleach.clean(text, tags=allowed_tags, attributes=allowed_attrs)
        except Exception:
            # never hand back unsanitized input, but record why it failed
            log.exception('Failed to sanitize text using bleach')
            return 'UNPARSEABLE TEXT'

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name whose extension is inspected
        :param exclude: list/tuple of extensions to ignore; any other value
            disables the filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([ext for ext, _ in cls.MARKDOWN_EXTS if ext])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([ext for ext, _ in cls.RST_EXTS if ext])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders ``source`` with the renderer detected from ``filename``.
        Anything that is not recognised falls back to the plain renderer.

        :param source: source text to render
        :param filename: file name used for renderer detection
        """
        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        :param text: markdown source to pre-process
        :return: source with multi-underscore words escaped; ``<pre>`` blocks
            are shielded from that escaping
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            digest = matchobj.group(1)
            if digest not in extractions:
                # a literal placeholder typed by the user, leave it untouched
                return matchobj.group(0)
            return '\n\n' + extractions[digest]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Wrap every http(s) URL matched by ``URL_PAT`` in an ``<a>`` tag.

        :param text: text to process
        """
        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render ``source`` as plain text: URLs become links, newlines become
        ``<br />`` and the result is sanitized with bleach.

        :param source: text to render
        :param universal_newline: normalise all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a ``<br />``
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')

        # the text is user controlled, always sanitize the generated html
        rendered = cls.bleach_clean(source)
        return rendered

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        returns markdown rendered code cleaned by the bleach library

        :param source: markdown source
        :param safe: on rendering errors fall back to the plain renderer
            instead of raising
        :param flavored: use the Github flavored renderer and pre-processing
        :param mentions: highlight @mentions as bold text before rendering
        :param clean_html: sanitize the rendered html with bleach
        """

        if flavored:
            markdown_renderer = get_markdown_renderer_flavored(
                cls.extensions, cls.output_format)
        else:
            markdown_renderer = get_markdown_renderer(
                cls.extensions, cls.output_format)

        if mentions:
            mention_pat = re.compile(MENTIONS_REGEX)

            def wrapp(match_obj):
                uname = match_obj.groups()[0]
                return ' **@%(uname)s** ' % {'uname': uname}
            mention_hl = mention_pat.sub(wrapp, source).strip()
            # we extracted mentions render with this using Mentions false
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        source = safe_unicode(source)

        try:
            if flavored:
                source = cls._flavored_markdown(source)
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.bleach_clean(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        Render reStructuredText ``source`` to an html fragment prefixed with
        the document title.

        :param source: rst source
        :param safe: on rendering errors fall back to the plain renderer
            instead of raising
        :param mentions: highlight @mentions as bold text before rendering
        :param clean_html: sanitize the rendered fragment with bleach
        """
        if mentions:
            mention_pat = re.compile(MENTIONS_REGEX)

            def wrapp(match_obj):
                uname = match_obj.groups()[0]
                return ' **@%(uname)s** ' % {'uname': uname}
            mention_hl = mention_pat.sub(wrapp, source).strip()
            # we extracted mentions render with this using Mentions false
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_unicode(source)
        try:
            docutils_settings = dict(
                [(alias, None) for alias in
                 cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])

            docutils_settings.update({
                'input_encoding': 'unicode', 'report_level': 4})

            # NOTE(review): this also registers the two settings keys above
            # as directive names; kept as-is to preserve behaviour
            for k, v in docutils_settings.items():
                directives.register_directive(k, v)

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.bleach_clean(rendered)
            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a jupyter notebook (version 4 JSON in ``source``) to html.

        :param source: notebook content
        :param safe: replace javascript outputs with a sandbox marker and
            sanitize markdown cells
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                if not safe:
                    # nothing is sandboxed in unsafe mode
                    return nb, resources

                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    for cell_output in cell.get('outputs', []):
                        if 'data' not in cell_output:
                            continue
                        if 'application/javascript' in cell_output['data']:
                            cell_output['data']['text/plain'] = sandbox_text
                            cell_output['data'].pop('application/javascript', None)

                    if 'source' in cell and cell['cell_type'] == 'markdown':
                        # sanitize similar like in markdown
                        cell['source'] = cls.bleach_clean(cell['source'])

                return nb, resources

        def _sanitize_resources(input_resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS; without the marker the sheet is left intact
            # instead of being cut down to its last character
            if input_resources:
                marker_pos = input_resources[0].find('/*! Source')
                if marker_pos >= 0:
                    input_resources[0] = input_resources[0][marker_pos:]
            return input_resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- Load mathjax -->
            <!-- MathJax configuration -->
            <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                TeX: {
                    extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                },
                tex2jax: {
                    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                    processEscapes: true,
                    processEnvironments: true
                },
                // Center justify equations in code and markdown cells. Elsewhere
                // we use CSS to left justify single line equations in code cells.
                displayAlign: 'center',
                "HTML-CSS": {
                    styles: {'.MathJax_Display': {"margin": 0}},
                    linebreaks: { automatic: true },
                    availableFonts: ["STIX", "TeX"]
                },
                showMathMenu: false
            });
            </script>
            <!-- End of mathjax configuration -->
            <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
540 542
541 543
class RstTemplateRenderer(object):
    """
    Renders Mako templates stored in the ``templates/rst_templates``
    directory located one level above this module's directory.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        package_root = os.path.abspath(os.path.dirname(module_dir))
        templates_dir = os.path.join(package_root, 'templates', 'rst_templates')

        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Look up ``templatename`` in the template store."""
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render the template called ``template_name`` with ``kwargs`` as its
        context and return the result.
        """
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now