##// END OF EJS Templates
security: use custom writer for RST rendering to prevent injection of javascript: tags.
marcink -
r1833:56150ab5 default
parent child Browse files
Show More
@@ -1,461 +1,488 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31
32 32 from mako.lookup import TemplateLookup
33 33 from mako.template import Template as MakoTemplate
34 34
35 35 from docutils.core import publish_parts
36 36 from docutils.parsers.rst import directives
37 from docutils import writers
38 from docutils.writers import html4css1
37 39 import markdown
38 40
39 41 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
40 42 from rhodecode.lib.utils2 import (
41 43 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
42 44
# module level logger, named after this module
log = logging.getLogger(__name__)

# default renderer used to generate automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'
47 49
48 50
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    # target written into every reference that uses the javascript: scheme
    SANDBOXED_REFURI = 'javascript:alert("SandBoxedJavascript")'

    # whitespace and control characters; these are removed from the scheme
    # before it is compared, so that variants like " javascript:" or
    # "java\tscript:" are recognised as well
    _SCHEME_NOISE = re.compile(r'[\x00-\x20]+')

    def visit_reference(self, node):
        """
        Neutralise ``javascript:`` reference targets, then delegate to the
        stock html4css1 translator.

        The scheme (everything before the first ``:`` of ``refuri``) is
        compared case-insensitively and with whitespace/control characters
        removed.

        :param node: docutils reference node being visited
        """
        if 'refuri' in node.attributes:
            refuri = node['refuri']
            if ':' in refuri:
                prefix = refuri.split(':', 1)[0]
                scheme = self._SCHEME_NOISE.sub('', prefix).lower()
                if scheme == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = self.SANDBOXED_REFURI

        # old style class requires this...
        return html4css1.HTMLTranslator.visit_reference(self, node)
68
69
class RhodeCodeWriter(html4css1.Writer):
    """
    html4css1 writer that translates documents using CustomHTMLTranslator
    """

    def __init__(self):
        # initialise the generic docutils writer, then plug in our translator
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
74
75
def relative_links(html_source, server_path):
    """
    Rewrite link targets inside ``html_source`` using ``relative_path``.

    The ``src`` attribute of ``img`` and ``video`` elements and the ``href``
    attribute of ``a`` elements not carrying the ``gfm`` class are passed
    through ``relative_path`` together with ``server_path``. Elements that
    lack the attribute, or have it empty, are left untouched.

    :param html_source: html markup to process
    :param server_path: current server path, handed to ``relative_path``
    :return: serialized html with rewritten links; ``html_source`` unchanged
        when it is empty or cannot be parsed
    """
    if not html_source:
        return html_source

    # a bare ``import lxml`` does not load the ``lxml.html`` sub-package,
    # import it explicitly so the calls below don't depend on other modules
    import lxml.html

    try:
        doc = lxml.html.fromstring(html_source)
    except Exception:
        return html_source

    for selector, attr_name in (('img, video', 'src'),
                                ('a:not(.gfm)', 'href')):
        for el in doc.cssselect(selector):
            # .get() because e.g. <a name="..."> has no href and
            # <video> can define its sources via child elements
            target = el.attrib.get(attr_name)
            if target:
                el.attrib[attr_name] = relative_path(target, server_path)

    return lxml.html.tostring(doc)
69 96
70 97
def relative_path(path, request_path, is_repo_file=None):
    """
    Resolve a relative link ``path`` against the current server path
    ``request_path`` (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target to resolve
    :param request_path: current server path
    :param is_repo_file: optional callable that gets ``request_path`` and
        tells if it points to a file; when not given every request path
        is assumed to be a file
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    # assume default is a valid file path
    file_check = is_repo_file or (lambda p: True)

    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    # skip data, anchor, invalid links, and links that carry a network
    # location (absolute ones)
    skipped_prefixes = (u'data:', u'javascript:', u'#', u':')
    if path.startswith(skipped_prefixes) or urlparse.urlparse(path).netloc:
        return path

    if not request_path:
        return path

    # drop a leading '/' first and a leading './' afterwards
    for prefix in (u'/', u'./'):
        if path.startswith(prefix):
            path = path[len(prefix):]

    # compute how deep we need to traverse the request_path, if it is
    # a VALID file we start one level up
    levels_up = 1 if file_check(request_path) else 0
    while path.startswith(u'../'):
        levels_up += 1
        path = path[3:]

    segments = request_path.split('/')
    if levels_up > 0:
        segments = segments[:-levels_up]
    segments.append(path)

    return u'/' + u'/'.join(segments).lstrip(u'/')
132 159
133 160
class MarkupRenderer(object):
    """
    Renders markup sources into html. Available renderers are the
    ``markdown``, ``rst``, ``jupyter`` and ``plain`` classmethods, the
    ``render`` method picks one of them based on a filename.
    """
    # rst directives replaced with None before rendering, see ``rst``
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    # filename patterns used by ``_detect_renderer``
    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    # NOTE: these attributes must stay above the ``markdown`` method, below
    # it the name ``markdown`` would refer to the method, not the module
    extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
    markdown_renderer = markdown.Markdown(
        extensions, safe_mode=True, enable_attributes=False)

    markdown_renderer_flavored = markdown.Markdown(
        extensions + [GithubFlavoredMarkdownExtension()], safe_mode=True,
        enable_attributes=False)

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        The decision is made only on the filename: markdown, rst and
        jupyter extensions select their renderers, anything else
        (including a missing filename) is rendered as plain text.

        :param source: markup source, not inspected here
        :param filename: name of the file the source comes from
        :return: the renderer classmethod to call with the source
        """
        # filename defaults to None, the patterns can only search strings
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # plain readme names (PLAIN_PAT) and everything unknown
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name checked against the known extensions
        :param exclude: list or tuple of extensions to ignore, any other
            value disables the filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([x[0] for x in cls.MARKDOWN_EXTS if x[0]])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([x[0] for x in cls.RST_EXTS if x[0]])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders a given source using the renderer detected from the
        filename, see ``_detect_renderer``.

        :param source: markup source to render
        :param filename: name of the file the source comes from
        :return: rendered html
        """

        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _highlight_mentions(cls, source):
        """
        Wrap every match of MENTIONS_REGEX in ``source`` as `` **@name** ``
        (bold in both markdown and rst) and strip the result.

        :param source: markup source
        """
        mention_pat = re.compile(MENTIONS_REGEX)

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            return ' **@%(uname)s** ' % {'uname': uname}

        return mention_pat.sub(wrapp, source).strip()

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        :param text: markdown source
        :return: pre-processed markdown source
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        # NOTE(review): re.MULTILINE is not used here, so `^` anchors only
        # at the very beginning of the text - confirm if that's intended
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            return '\n\n' + extractions[matchobj.group(1)]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Wrap http(s) urls found in ``text`` into html anchors.

        :param text: text to process
        """
        url_pat = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                             r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return url_pat.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True):
        """
        Render ``source`` as plain text: urls become anchors and new lines
        become ``<br />`` tags.

        :param source: text to render
        :param universal_newline: normalize all line endings to ``\\n`` first
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        source = cls.urlify_text(source)
        return '<br />' + source.replace("\n", '<br />')

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False):
        """
        Render markdown ``source`` into html.

        :param source: markdown source
        :param safe: on rendering errors fall back to ``plain`` instead of
            re-raising the exception
        :param flavored: use the github flavored renderer
        :param mentions: highlight user mentions before rendering
        """
        # It does not allow to insert inline HTML. In presence of HTML tags, it
        # will replace them instead with [HTML_REMOVED]. This is controlled by
        # the safe_mode=True parameter of the markdown method.

        if flavored:
            markdown_renderer = cls.markdown_renderer_flavored
        else:
            markdown_renderer = cls.markdown_renderer

        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False)

        source = safe_unicode(source)
        try:
            if flavored:
                source = cls._flavored_markdown(source)
            return markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def rst(cls, source, safe=True, mentions=False):
        """
        Render reStructuredText ``source`` into html.

        Directives listed in RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES are
        replaced with None, file insertion and raw content are switched off
        in docutils, and the document is written with RhodeCodeWriter.

        :param source: rst source
        :param safe: on rendering errors fall back to ``plain`` instead of
            re-raising the exception
        :param mentions: highlight user mentions before rendering
        """
        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.rst(mention_hl, safe=safe, mentions=False)

        source = safe_unicode(source)
        try:
            # override the stock implementation of directives we don't allow
            for directive_name in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(directive_name, None)

            # only real docutils settings belong here
            docutils_settings = {
                'input_encoding': 'unicode',
                'report_level': 4,
                # the source is untrusted; don't let directives like
                # `csv-table` with :file:/:url: read files of the server
                'file_insertion_enabled': False,
                'raw_enabled': False,
            }

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)

            return parts['html_title'] + parts["fragment"]
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a jupyter notebook given as ``source`` into html.

        :param source: notebook content, read using nbformat version 4
        :param safe: replace ``application/javascript`` cell outputs with
            a text placeholder
        :return: html made of a marker comment, inlined CSS, MathJax setup
            and the exported notebook body
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if safe and 'outputs' in cell:
                        for cell_output in cell['outputs']:
                            if 'data' in cell_output:
                                if 'application/javascript' in cell_output['data']:
                                    cell_output['data']['text/plain'] = sandbox_text
                                    cell_output['data'].pop('application/javascript', None)
                return nb, resources

        def _sanitize_resources(resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS
            # NOTE(review): find() gives -1 when the marker is missing, the
            # slice then keeps only the last character - confirm the marker
            # is always present
            resources[0] = resources[0][resources[0].find('/*! Source'):]
            return resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- Load mathjax -->
                <!-- MathJax configuration -->
                <script type="text/x-mathjax-config">
                MathJax.Hub.Config({
                    jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                    extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                    TeX: {
                        extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                    },
                    tex2jax: {
                        inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                        displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                        processEscapes: true,
                        processEnvironments: true
                    },
                    // Center justify equations in code and markdown cells. Elsewhere
                    // we use CSS to left justify single line equations in code cells.
                    displayAlign: 'center',
                    "HTML-CSS": {
                        styles: {'.MathJax_Display': {"margin": 0}},
                        linebreaks: { automatic: true },
                        availableFonts: ["STIX", "TeX"]
                    },
                    showMathMenu: false
                });
                </script>
                <!-- End of mathjax configuration -->
                <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
444 471
445 472
class RstTemplateRenderer(object):
    """
    Renders mako templates stored in the ``templates/rst_templates``
    directory, located one level above the directory of this module.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        base_dir = os.path.abspath(os.path.dirname(module_dir))
        templates_dir = os.path.join(base_dir, 'templates', 'rst_templates')

        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """
        Look up a template in the template store

        :param templatename: name of the template to fetch
        """
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render a template, kwargs are passed to it as the context

        :param template_name: name of the template to render
        """
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now