##// END OF EJS Templates
markup: allow better lxml import failure detection....
marcink -
r2002:1ea54f1f default
parent child Browse files
Show More
@@ -1,488 +1,495 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2017 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21
22 22 """
23 23 Renderer for markup languages with ability to parse using rst or markdown
24 24 """
25 25
26 26 import re
27 27 import os
28 28 import lxml
29 29 import logging
30 30 import urlparse
31 31
32 32 from mako.lookup import TemplateLookup
33 33 from mako.template import Template as MakoTemplate
34 34
35 35 from docutils.core import publish_parts
36 36 from docutils.parsers.rst import directives
37 37 from docutils import writers
38 38 from docutils.writers import html4css1
39 39 import markdown
40 40
41 41 from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension
42 42 from rhodecode.lib.utils2 import (
43 43 safe_str, safe_unicode, md5_safe, MENTIONS_REGEX)
44 44
45 45 log = logging.getLogger(__name__)
46 46
47 47 # default renderer used to generate automated comments
48 48 DEFAULT_COMMENTS_RENDERER = 'rst'
49 49
50 50
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    def visit_reference(self, node):
        """
        Replace ``javascript:`` reference targets with a harmless marker and
        then delegate to the stock html4css1 translator.

        :param node: reference node being visited; when it carries a
            ``refuri`` attribute using the javascript scheme, that attribute
            is overwritten in place.
        :returns: whatever ``html4css1.HTMLTranslator.visit_reference``
            returns for the (possibly rewritten) node.
        """
        if 'refuri' in node.attributes:
            refuri = node['refuri']
            if ':' in refuri:
                scheme = refuri.split(':', 1)[0]
                # URL schemes are case-insensitive, and URL parsers drop
                # leading spaces/control characters as well as embedded
                # tabs/newlines, so normalise before comparing. Otherwise
                # e.g. "JavaScript:" or "java\tscript:" would slip through.
                normalized = re.sub(r'[\s\x00-\x1f]+', '', scheme).lower()
                if normalized == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # old style class requires this...
        return html4css1.HTMLTranslator.visit_reference(self, node)
66 66 # old style class requires this...
67 67 return html4css1.HTMLTranslator.visit_reference(self, node)
68 68
69 69
class RhodeCodeWriter(writers.html4css1.Writer):
    """
    html4css1 writer that renders through ``CustomHTMLTranslator``.
    """

    def __init__(self):
        # Only the generic writer initialiser is invoked here; the translator
        # class is then set explicitly to the sandboxing translator.
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
74 74
75 75
def relative_links(html_source, server_path):
    """
    Rewrite link targets inside an HTML fragment using ``relative_path``.

    The ``src`` attribute of ``img``/``video`` elements and the ``href``
    attribute of ``a`` elements that do not carry the ``gfm`` class are each
    passed through ``relative_path`` together with `server_path`.

    :param html_source: HTML markup to process; falsy values are returned
        unchanged.
    :param server_path: current request path, forwarded to ``relative_path``.
    :returns: the serialized, rewritten markup, or `html_source` untouched
        when lxml cannot be imported or the markup cannot be parsed.
    """
    if not html_source:
        return html_source

    try:
        from lxml.html import fromstring
        from lxml.html import tostring
    except ImportError:
        log.exception('Failed to import lxml')
        return html_source

    try:
        # use the names imported above, so a broken lxml.html install is
        # always caught by the ImportError guard rather than surfacing here
        doc = fromstring(html_source)
    except Exception:
        # best effort: unparsable markup is handed back as-is
        log.debug('Failed to parse html source for link rewriting',
                  exc_info=True)
        return html_source

    # (css selector, attribute holding the link) pairs to rewrite
    link_targets = (
        ('img, video', 'src'),
        ('a:not(.gfm)', 'href'),
    )
    for selector, attr_name in link_targets:
        for el in doc.cssselect(selector):
            target = el.attrib.get(attr_name)
            if target:
                el.attrib[attr_name] = relative_path(target, server_path)

    return tostring(doc)
96 103
97 104
def relative_path(path, request_path, is_repo_file=None):
    """
    Resolve a relative link against the current server path.

    `path` is the (relative) link and `request_path` is the current server
    path (not absolute), e.g.::

        path = '../logo.png'
        request_path= '/repo/files/path/file.md'
        produces: '/repo/files/logo.png'

    :param path: link target to resolve; a falsy value yields `request_path`.
    :param request_path: server path the link is resolved against.
    :param is_repo_file: optional callable receiving `request_path`; a truthy
        result means the request path points at a file, so resolution starts
        one level up. When omitted every request path is assumed to be a file.
    :returns: `path` unchanged for data/javascript/anchor/invalid links, for
        links with a network location, or when `request_path` is empty;
        otherwise the resolved path with a single leading ``/``.
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    # default checker assumes the request path is a valid file path
    file_check = is_repo_file or (lambda p: True)

    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    # skip data, anchor, invalid links
    untouched_prefixes = (u'data:', u'javascript:', u'#', u':')
    if path.startswith(untouched_prefixes):
        return path

    # anything with a network location is already absolute
    if urlparse.urlparse(path).netloc:
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    segments = request_path.split('/')

    # how many trailing segments of the request path have to be dropped:
    # one when the request path is a VALID file, plus one per leading '../'
    levels_up = 1 if file_check(request_path) else 0
    while path.startswith(u'../'):
        levels_up += 1
        path = path[3:]

    if levels_up > 0:
        segments = segments[:-levels_up]

    segments.append(path)
    return u'/' + u'/'.join(segments).lstrip(u'/')
159 166
160 167
class MarkupRenderer(object):
    """
    Renders markup sources (Markdown, reStructuredText, Jupyter notebooks or
    plain text) to HTML, picking the renderer from the file name.
    """

    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    extensions = ['codehilite', 'extra', 'def_list', 'sane_lists']
    markdown_renderer = markdown.Markdown(
        extensions, safe_mode=True, enable_attributes=False)

    markdown_renderer_flavored = markdown.Markdown(
        extensions + [GithubFlavoredMarkdownExtension()], safe_mode=True,
        enable_attributes=False)

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        Pick the rendering method matching the extension of `filename`.

        :param source: markup source (not inspected by the detection)
        :param filename: name of the file being rendered; ``None`` or an
            empty name selects the plain renderer
        :returns: one of the ``markdown``/``rst``/``jupyter``/``plain``
            class methods
        """
        # `filename` defaults to None, which the regex search cannot handle
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.findall(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.findall(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.findall(filename):
            detected_renderer = 'jupyter'
        else:
            # bare "readme" names (PLAIN_PAT) and anything unrecognised are
            # both rendered as plain text
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name whose extension is checked
        :param exclude: list/tuple of extensions to ignore; any other value
            disables the filtering
        :returns: ``'markdown'``, ``'rst'`` or ``None``
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        def _extensions(weighted_exts):
            # drop the weights and empty extensions, then apply `exclude`
            return tuple(_filter([ext for ext, _ in weighted_exts if ext]))

        if filename.endswith(_extensions(cls.MARKDOWN_EXTS)):
            return 'markdown'
        if filename.endswith(_extensions(cls.RST_EXTS)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders given source using the renderer detected from `filename`;
        unrecognised names are rendered as plain text.

        :param source: markup source to render
        :param filename: file name used to detect the renderer
        :returns: output of the detected renderer
        """

        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown pre-processing

        :param text: markdown source
        :returns: source with underscore-heavy words escaped, and ``<pre>``
            blocks preserved verbatim
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest
        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        # NOTE(review): the pattern is applied without re.MULTILINE, so it
        # can only match at the very start of the text - confirm intent.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            digest = matchobj.group(1)
            if digest not in extractions:
                # placeholder-looking text typed by the author, not one of
                # our extractions; leave it untouched instead of failing
                return matchobj.group(0)
            return '\n\n' + extractions[digest]
        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Wrap every http(s) URL found in `text` into an ``<a>`` tag.

        :param text: text to process
        :returns: text with URLs replaced by anchors pointing at themselves
        """
        url_pat = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                             r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return url_pat.sub(url_func, text)

    @classmethod
    def plain(cls, source, universal_newline=True):
        """
        Render `source` as plain text: URLs become links and new lines are
        replaced with ``<br />``.

        :param source: text to render
        :param universal_newline: normalise all line endings to ``\\n``
            before rendering
        :returns: rendered html, always starting with ``<br />``
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        source = cls.urlify_text(source)
        return '<br />' + source.replace("\n", '<br />')

    @classmethod
    def _highlight_mentions(cls, source):
        """
        Wrap every match of MENTIONS_REGEX into bold markup (``**@name**``).

        :param source: markup source
        :returns: stripped source with mentions emphasised
        """
        mention_pat = re.compile(MENTIONS_REGEX)

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            return ' **@%(uname)s** ' % {'uname': uname}
        return mention_pat.sub(wrapp, source).strip()

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False):
        """
        Render Markdown `source` to html.

        :param source: markdown source
        :param safe: on rendering errors fall back to the plain renderer
            instead of re-raising
        :param flavored: apply the Github flavored pre-processing and
            extension
        :param mentions: emphasise user mentions before rendering
        :returns: rendered html
        """
        # It does not allow to insert inline HTML. In presence of HTML tags, it
        # will replace them instead with [HTML_REMOVED]. This is controlled by
        # the safe_mode=True parameter of the markdown method.

        if flavored:
            markdown_renderer = cls.markdown_renderer_flavored
        else:
            markdown_renderer = cls.markdown_renderer

        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False)

        source = safe_unicode(source)
        try:
            if flavored:
                source = cls._flavored_markdown(source)
            return markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def rst(cls, source, safe=True, mentions=False):
        """
        Render reStructuredText `source` to html.

        :param source: rst source
        :param safe: on rendering errors fall back to the plain renderer
            instead of re-raising
        :param mentions: emphasise user mentions before rendering
        :returns: html title followed by the rendered fragment
        """
        if mentions:
            mention_hl = cls._highlight_mentions(source)
            # we extracted mentions render with this using Mentions false
            return cls.rst(mention_hl, safe=safe, mentions=False)

        source = safe_unicode(source)
        try:
            disallowed = cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES

            # Override the disallowed directives in the docutils registry.
            # Only directive names are registered here; the settings keys
            # added below are not directives.
            # NOTE(review): relies on docutils treating a None entry as an
            # unavailable directive - confirm.
            for directive_name in disallowed:
                directives.register_directive(directive_name, None)

            docutils_settings = dict(
                [(alias, None) for alias in disallowed])
            docutils_settings.update({'input_encoding': 'unicode',
                                      'report_level': 4})

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)

            return parts['html_title'] + parts["fragment"]
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallbacking to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a Jupyter notebook to html.

        :param source: notebook source, read with ``nbformat.reads`` as
            version 4
        :param safe: replace ``application/javascript`` cell outputs with a
            plain text marker
        :returns: html body made of a header comment, the inlined css,
            the MathJax setup and the exported notebook
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if safe and 'outputs' in cell:
                        for cell_output in cell['outputs']:
                            if 'data' in cell_output:
                                if 'application/javascript' in cell_output['data']:
                                    cell_output['data']['text/plain'] = sandbox_text
                                    cell_output['data'].pop('application/javascript', None)
                return nb, resources

        def _sanitize_resources(resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS; leave the css alone when the marker is
            # missing, a failed find() (-1) would otherwise cut everything
            # but the last character
            if resources:
                marker_pos = resources[0].find('/*! Source')
                if marker_pos != -1:
                    resources[0] = resources[0][marker_pos:]
            return resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- Load mathjax -->
                <!-- MathJax configuration -->
                <script type="text/x-mathjax-config">
                MathJax.Hub.Config({
                    jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                    extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                    TeX: {
                        extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                    },
                    tex2jax: {
                        inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                        displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                        processEscapes: true,
                        processEnvironments: true
                    },
                    // Center justify equations in code and markdown cells. Elsewhere
                    // we use CSS to left justify single line equations in code cells.
                    displayAlign: 'center',
                    "HTML-CSS": {
                        styles: {'.MathJax_Display': {"margin": 0}},
                        linebreaks: { automatic: true },
                        availableFonts: ["STIX", "TeX"]
                    },
                    showMathMenu: false
                });
                </script>
                <!-- End of mathjax configuration -->
                <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = '<style>{}</style>'.format(
                ''.join(_sanitize_resources(resources['inlining']['css'])))

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
471 478
472 479
class RstTemplateRenderer(object):
    """
    Renders the templates stored in the ``templates/rst_templates``
    directory, located one level above the directory of this module.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        base_dir = os.path.abspath(os.path.dirname(module_dir))
        templates_dir = os.path.join(base_dir, 'templates', 'rst_templates')

        # every template gets the helpers module available as `h`
        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """
        Look up `templatename` in the template store.
        """
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render the template called `template_name`, passing `kwargs` to it.
        """
        return self._get_template(template_name).render(**kwargs)
General Comments 0
You need to be logged in to leave comments. Login now