##// END OF EJS Templates
security: improve Javascript RST sandbox to also catch mixed case.
marcink -
r3147:7609f194 default
parent child Browse files
Show More
@@ -1,524 +1,526 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2018 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31 import bleach
32 32
33 33 from mako.lookup import TemplateLookup
34 34 from mako.template import Template as MakoTemplate
35 35
36 36 from docutils.core import publish_parts
37 37 from docutils.parsers.rst import directives
38 38 from docutils import writers
39 39 from docutils.writers import html4css1
40 40 import markdown
41 41
42 42 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
43 43 from rhodecode.lib.utils2 import (
44 44 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
45 45
# module-level logger, named after this module
log = logging.getLogger(__name__)

# markup renderer used when generating automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'
50 50
51 51
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    # target written in place of any rejected ``javascript:`` reference
    SANDBOXED_REFURI = 'javascript:alert("SandBoxedJavascript")'

    @staticmethod
    def _uri_scheme(refuri):
        """
        Return the normalised scheme of `refuri`, or '' when it has none.

        The scheme is everything before the first ``:``. It is lower-cased
        and stripped of spaces and ASCII control characters, so that
        ``JavaScript:``, ``  javascript:`` and ``java\\tscript:`` all
        normalise to ``javascript``.

        :param refuri: value of the ``refuri`` attribute of a reference node
        """
        if ':' not in refuri:
            return ''

        prefix = refuri.split(':', 1)[0]
        # NOTE(review): browsers are understood to ignore leading control
        # characters and embedded tab/newline when parsing a URL scheme;
        # removing every character <= 0x20 is a superset of that.
        cleaned = ''.join(ch for ch in prefix if ord(ch) > 0x20)
        return cleaned.lower()

    def visit_reference(self, node):
        """
        Replace ``javascript:`` link targets with a harmless placeholder and
        delegate the actual rendering to the stock html4css1 translator.
        """
        if 'refuri' in node.attributes:
            if self._uri_scheme(node['refuri']) == 'javascript':
                # we don't allow javascript type of refs...
                node['refuri'] = self.SANDBOXED_REFURI

        # old style class requires this...
        return html4css1.HTMLTranslator.visit_reference(self, node)
69 71
70 72
class RhodeCodeWriter(writers.html4css1.Writer):
    """html4css1 writer that renders through CustomHTMLTranslator."""

    def __init__(self):
        # initialise through the generic docutils ``Writer`` base, then plug
        # in the translator that sandboxes javascript references
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
75 77
76 78
def relative_links(html_source, server_paths):
    """
    Rewrite relative links inside rendered HTML to server paths.

    ``src`` of ``img``/``video`` elements is resolved against
    ``server_paths['raw']``. ``href`` of ``a`` elements (except those with
    the ``gfm`` class) is resolved against ``server_paths['raw']`` when it
    ends with ``?raw=1`` and against ``server_paths['standard']`` otherwise.

    :param html_source: rendered HTML; returned unchanged when empty, when
        lxml cannot be imported or when the source cannot be parsed
    :param server_paths: mapping with ``raw`` and ``standard`` entries
    :return: serialized HTML with rewritten links
    """
    if not html_source:
        return html_source

    try:
        from lxml.html import fromstring
        from lxml.html import tostring
    except ImportError:
        log.exception('Failed to import lxml')
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # best effort: keep the html as-is, but leave a trace of the failure
        log.debug('Failed to parse html source, links are left untouched',
                  exc_info=True)
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if href:
            mode = 'raw' if href.endswith('?raw=1') else 'standard'
            el.attrib['href'] = relative_path(href, server_paths[mode])

    return tostring(doc)
108 110
109 111
def relative_path(path, request_path, is_repo_file=None):
    """
    Resolve a relative link against the current (non absolute) server path.

    Example::

        path = '../logo.png'
        request_path= '/repo/files/path/file.md'
        produces: '/repo/files/logo.png'

    :param path: link target; returned as-is for ``data:``/``javascript:``
        links, anchors, links starting with ``:`` and links with a netloc
    :param request_path: current server path, returned when `path` is empty
    :param is_repo_file: optional callable telling if `request_path` points
        to a file; when not given every request path is treated as a file
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    # data, anchor and invalid links are never rewritten
    untouched_prefixes = (u'data:', u'javascript:', u'#', u':')

    # without an explicit check assume request_path is a valid file path
    file_check = is_repo_file or (lambda _path: True)

    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    if path.startswith(untouched_prefixes):
        return path

    if urlparse.urlparse(path).netloc:
        # absolute link
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    segments = request_path.split('/')

    # how many trailing segments of request_path have to be dropped: one
    # when request_path is a file, plus one for every leading '../'
    levels_up = 1 if file_check(request_path) else 0
    while path.startswith(u'../'):
        path = path[3:]
        levels_up += 1

    if levels_up > 0:
        segments = segments[:-levels_up]
    segments.append(path)

    return u'/' + u'/'.join(segments).lstrip(u'/')
171 173
172 174
class MarkupRenderer(object):
    """
    Renders markup sources (markdown, rst, jupyter notebooks, plain text)
    into HTML, picking the renderer from the file name.
    """

    # rst directives registered without an implementation before rendering
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
    output_format = 'html4'

    # NOTE: both Markdown instances are shared by every call; `markdown()`
    # resets them before each conversion.
    markdown_renderer = markdown.Markdown(
        extensions=extensions, enable_attributes=False,
        output_format=output_format)

    markdown_renderer_flavored = markdown.Markdown(
        extensions=extensions + [GithubFlavoredMarkdownExtension()],
        enable_attributes=False, output_format=output_format)

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        :param source: markup source (not inspected by the detection)
        :param filename: name of the rendered file; a missing name falls
            back to the plain renderer
        :return: the matching renderer classmethod
        """
        # `render()` defaults filename to None, which the patterns can't scan
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # extension-less READMEs (PLAIN_PAT) and everything unknown
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def bleach_clean(cls, text):
        """
        Sanitize `text` with bleach using the markdown tag/attribute
        whitelists; returns 'UNPARSEABLE TEXT' when bleach fails.
        """
        from .bleach_whitelist import markdown_attrs, markdown_tags
        allowed_tags = markdown_tags
        allowed_attrs = markdown_attrs

        try:
            return bleach.clean(text, tags=allowed_tags, attributes=allowed_attrs)
        except Exception:
            log.exception('Error when cleaning html with bleach')
            return 'UNPARSEABLE TEXT'

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name checked against the known extensions
        :param exclude: list/tuple of extensions to ignore; any other type
            disables the filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([ext for ext, _ in cls.MARKDOWN_EXTS if ext])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([ext for ext, _ in cls.RST_EXTS if ext])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders given source using a renderer detected from the file name.
        Unknown names are rendered in plain mode, which only replaces new
        lines with <br /> and links urls.

        :param source: markup source to render
        :param filename: name of the file the source comes from
        """
        renderer = self._detect_renderer(source, filename)
        return renderer(source)

    @classmethod
    def _highlight_mentions(cls, source):
        """Wrap every user mention found in `source` in bold markup."""
        mention_pat = re.compile(MENTIONS_REGEX)

        def bold_mention(match_obj):
            uname = match_obj.groups()[0]
            return ' **@%(uname)s** ' % {'uname': uname}

        return mention_pat.sub(bold_mention, source).strip()

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        :param text: markdown source to pre-process
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        # NOTE(review): without re.MULTILINE the `^` anchor only matches at
        # the very start of the text - confirm this is intended.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            digest = matchobj.group(1)
            if digest not in extractions:
                # placeholder-looking text typed by the user, keep it as-is
                return matchobj.group(0)
            return '\n\n' + extractions[digest]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """Wrap every url matched by URL_PAT in an ``<a>`` tag."""
        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render `source` as plain text: urls become links and new lines
        become ``<br />``.

        :param universal_newline: normalize all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a ``<br />``
        """
        # NOTE(review): the source is not html-escaped here - presumably the
        # callers take care of that; verify.
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')
        return source

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        returns markdown rendered code cleaned by the bleach library

        :param source: markdown source
        :param safe: on rendering errors fall back to plain mode instead of
            raising
        :param flavored: use the github flavored renderer
        :param mentions: highlight user mentions before rendering
        :param clean_html: pass the rendered html through `bleach_clean`
        """

        if flavored:
            markdown_renderer = cls.markdown_renderer_flavored
        else:
            markdown_renderer = cls.markdown_renderer

        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        source = safe_unicode(source)

        try:
            if flavored:
                source = cls._flavored_markdown(source)
            # the renderer instance is shared, drop the state of the
            # previously converted document
            markdown_renderer.reset()
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.bleach_clean(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        returns rst rendered code, optionally cleaned by the bleach library

        :param source: rst source
        :param safe: on rendering errors fall back to plain mode instead of
            raising
        :param mentions: highlight user mentions before rendering
        :param clean_html: pass the rendered fragment through `bleach_clean`
        """
        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_unicode(source)
        try:
            # disallowed directives get registered without an implementation
            # NOTE(review): presumably docutils then rejects them - verify
            for directive_name in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(directive_name, None)

            docutils_settings = {
                'input_encoding': 'unicode', 'report_level': 4}

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.bleach_clean(rendered)
            # NOTE(review): html_title is not passed through bleach_clean
            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a jupyter notebook (format version 4) into HTML, together
        with the inlined notebook CSS and the MathJax loader.

        :param source: notebook source
        :param safe: replace javascript cell outputs with a placeholder text
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if safe and 'outputs' in cell:
                        for cell_output in cell['outputs']:
                            if 'data' in cell_output:
                                if 'application/javascript' in cell_output['data']:
                                    cell_output['data']['text/plain'] = sandbox_text
                                    cell_output['data'].pop('application/javascript', None)
                return nb, resources

        def _sanitize_resources(resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't mess up the UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS; when the marker (or any css at all) is
            # missing the resources are left untouched
            if resources:
                marker_pos = resources[0].find('/*! Source')
                if marker_pos != -1:
                    resources[0] = resources[0][marker_pos:]
            return resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- Load mathjax -->
                <!-- MathJax configuration -->
                <script type="text/x-mathjax-config">
                MathJax.Hub.Config({
                    jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                    extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                    TeX: {
                        extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                    },
                    tex2jax: {
                        inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                        displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                        processEscapes: true,
                        processEnvironments: true
                    },
                    // Center justify equations in code and markdown cells. Elsewhere
                    // we use CSS to left justify single line equations in code cells.
                    displayAlign: 'center',
                    "HTML-CSS": {
                        styles: {'.MathJax_Display': {"margin": 0}},
                        linebreaks: { automatic: true },
                        availableFonts: ["STIX", "TeX"]
                    },
                    showMathMenu: false
                });
                </script>
                <!-- End of mathjax configuration -->
                <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
507 509
508 510
class RstTemplateRenderer(object):
    """Renders Mako templates looked up in ``templates/rst_templates``."""

    def __init__(self):
        # the templates directory sits two directory levels above this file
        base_dir = os.path.abspath(
            os.path.dirname(os.path.dirname(__file__)))
        templates_dir = os.path.join(base_dir, 'templates', 'rst_templates')

        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Fetch the template called `templatename` from the lookup store."""
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """Render `template_name`, passing `kwargs` as template variables."""
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now