markup_renderer.py
580 lines
| 19.4 KiB
| text/x-python
|
PythonLexer
r1 | # -*- coding: utf-8 -*- | |||
r4306 | # Copyright (C) 2011-2020 RhodeCode GmbH | |||
r1 | # | |||
# This program is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU Affero General Public License, version 3 | ||||
# (only), as published by the Free Software Foundation. | ||||
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU Affero General Public License | ||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
# | ||||
# This program is dual-licensed. If you wish to learn more about the | ||||
# RhodeCode Enterprise Edition, including its added features, Support services, | ||||
# and proprietary license terms, please see https://rhodecode.com/licenses/ | ||||
""" | ||||
Renderer for markup languages with ability to parse using rst or markdown | ||||
""" | ||||
import re | ||||
import os | ||||
r1527 | import lxml | |||
r1 | import logging | |||
r4919 | import urllib.parse | |||
r2440 | import bleach | |||
r396 | ||||
r1 | from mako.lookup import TemplateLookup | |||
r1491 | from mako.template import Template as MakoTemplate | |||
r1 | ||||
from docutils.core import publish_parts | ||||
from docutils.parsers.rst import directives | ||||
r1833 | from docutils import writers | |||
from docutils.writers import html4css1 | ||||
r1 | import markdown | |||
r1527 | from rhodecode.lib.markdown_ext import GithubFlavoredMarkdownExtension | |||
r3239 | from rhodecode.lib.utils2 import (safe_unicode, md5_safe, MENTIONS_REGEX) | |||
r1 | ||||
# module-level logger, also used by the guarded import below
log = logging.getLogger(__name__)

# default renderer used to generate automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'

# The lxml HTML helpers are imported defensively: when the import fails the
# error is logged and both names are set to None, so code further down can
# check them before attempting any HTML post-processing.
try:
    from lxml.html import fromstring
    from lxml.html import tostring
except ImportError:
    log.exception('Failed to import lxml')
    fromstring = None
    tostring = None
r1 | ||||
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    Custom HTML Translator used for sandboxing potential
    JS injections in ref links
    """

    def visit_literal_block(self, node):
        """
        Open a literal block as a ``<pre>`` element carrying the
        ``codehilite literal-block`` CSS classes.
        """
        self.body.append(
            self.starttag(node, 'pre', CLASS='codehilite literal-block'))

    def visit_reference(self, node):
        """
        Replace ``javascript:`` reference targets with a harmless
        placeholder, then delegate to the stock docutils implementation.

        :param node: docutils reference node; its ``refuri`` attribute is
            rewritten in place when it uses the ``javascript`` scheme.
        """
        if 'refuri' in node.attributes:
            refuri = node['refuri']
            if ':' in refuri:
                scheme = refuri.split(':', 1)[0]
                # Browsers ignore tabs/newlines inside a URL and strip
                # leading control characters and spaces, so values such as
                # "java\tscript:" still execute. Drop every character up to
                # and including the space before comparing the scheme.
                scheme = ''.join(ch for ch in scheme if ch > ' ').lower()
                if scheme == 'javascript':
                    # we don't allow javascript type of refs...
                    node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # explicit base-class call is kept from the original implementation
        return html4css1.HTMLTranslator.visit_reference(self, node)
class RhodeCodeWriter(writers.html4css1.Writer):
    """
    docutils HTML writer that produces its output through
    ``CustomHTMLTranslator``.
    """

    def __init__(self):
        # initialise through the generic docutils ``writers.Writer``
        # initialiser, then point the writer at our own translator class
        writers.Writer.__init__(self)
        self.translator_class = CustomHTMLTranslator
def relative_links(html_source, server_paths):
    """
    Rewrite ``src`` and ``href`` attributes of rendered HTML so that
    relative links resolve against the given server paths.

    :param html_source: HTML markup to process. Returned unchanged when it
        is empty, when the lxml helpers are unavailable, or when it cannot
        be parsed.
    :param server_paths: mapping with the keys ``'raw'`` and ``'standard'``.
        ``'raw'`` is used for ``img``/``video`` sources and for anchors whose
        ``href`` ends with ``?raw=1``; ``'standard'`` for all other anchors.
    :return: the document serialised with ``lxml.html.tostring``, or the
        untouched ``html_source`` in the cases listed above.
    """
    if not html_source:
        return html_source

    # both helpers come from the guarded lxml import and are None when it
    # failed; the previous ``not fromstring and tostring`` test never
    # triggered in that situation because of operator precedence
    if not (fromstring and tostring):
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # unparsable markup is passed through as-is
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    # anchors carrying the ``gfm`` class are left alone
    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if href:
            mode = 'raw' if href.endswith('?raw=1') else 'standard'
            el.attrib['href'] = relative_path(href, server_paths[mode])

    return tostring(doc)
def relative_path(path, request_path, is_repo_file=None):
    """
    relative link support, path is a rel path, and request_path is current
    server path (not absolute)

    e.g.

    path = '../logo.png'
    request_path= '/repo/files/path/file.md'
    produces: '/repo/files/logo.png'

    :param path: link target found in the document. ``data:``,
        ``javascript:``, anchor (``#``) and ``:``-prefixed values, links
        with a network location, and values that cannot be parsed as a URL
        are returned unchanged.
    :param request_path: server path of the document being rendered.
        Returned as-is when ``path`` is empty.
    :param is_repo_file: optional callable receiving ``request_path`` and
        returning whether it points at a file. Defaults to a check that
        always answers ``True``.
    :return: the resolved path, always starting with ``/`` when a
        resolution took place.
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_unicode(urllib.quote(safe_str(final_path), '/:'))

    def dummy_check(p):
        return True  # assume default is a valid file path

    is_repo_file = is_repo_file or dummy_check
    if not path:
        return request_path

    path = safe_unicode(path)
    request_path = safe_unicode(request_path)

    if path.startswith((u'data:', u'javascript:', u'#', u':')):
        # skip data, anchor, invalid links
        return path

    try:
        # on Python 3 ``urlparse`` is a function of ``urllib.parse``; the
        # former ``urllib.parse.urlparse.urlparse`` raised AttributeError
        is_absolute = bool(urllib.parse.urlparse(path).netloc)
    except ValueError:
        # malformed URL (e.g. broken IPv6 literal), treat as invalid link
        return path

    if is_absolute:
        return path

    if not request_path:
        return path

    if path.startswith(u'/'):
        path = path[1:]

    if path.startswith(u'./'):
        path = path[2:]

    parts = request_path.split('/')
    # compute how deep we need to traverse the request_path
    depth = 0

    if is_repo_file(request_path):
        # if request path is a VALID file, we use a relative path with
        # one level up
        depth += 1

    # every leading '../' climbs one more level
    while path.startswith(u'../'):
        depth += 1
        path = path[3:]

    if depth > 0:
        parts = parts[:-depth]

    parts.append(path)
    final_path = u'/'.join(parts).lstrip(u'/')

    return u'/' + final_path
_cached_markdown_renderer = None


def get_markdown_renderer(extensions, output_format):
    """
    Return the module-wide ``markdown.Markdown`` instance, creating it on
    first use.

    The instance is built once with attribute lists disabled; on later calls
    the cached object is returned and the arguments are not consulted.

    :param extensions: markdown extensions used when the renderer is created
    :param output_format: output format used when the renderer is created
    """
    global _cached_markdown_renderer

    if _cached_markdown_renderer is not None:
        return _cached_markdown_renderer

    renderer = markdown.Markdown(
        extensions=extensions,
        enable_attributes=False,
        output_format=output_format)
    _cached_markdown_renderer = renderer
    return renderer
_cached_markdown_renderer_flavored = None


def get_markdown_renderer_flavored(extensions, output_format):
    """
    Return the module-wide ``markdown.Markdown`` instance that additionally
    loads ``GithubFlavoredMarkdownExtension``, creating it on first use.

    The instance is built once with attribute lists disabled; on later calls
    the cached object is returned and the arguments are not consulted.

    :param extensions: base markdown extensions used when the renderer is
        created; the GitHub flavored extension is appended to them
    :param output_format: output format used when the renderer is created
    """
    global _cached_markdown_renderer_flavored

    if _cached_markdown_renderer_flavored is not None:
        return _cached_markdown_renderer_flavored

    all_extensions = extensions + [GithubFlavoredMarkdownExtension()]
    renderer = markdown.Markdown(
        extensions=all_extensions,
        enable_attributes=False,
        output_format=output_format)
    _cached_markdown_renderer_flavored = renderer
    return renderer
class MarkupRenderer(object):
    """
    Renders sources to HTML using markdown, reStructuredText, Jupyter
    notebooks or a plain-text fallback.
    """
    # directives that get disabled before rendering reStructuredText
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    # NOTE(review): ``$-_`` inside the character class is a range, so it
    # also matches characters such as ``<``, ``>`` and ``'`` - confirm this
    # is intended before relying on the match being a clean URL.
    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    MENTION_PAT = re.compile(MENTIONS_REGEX)

    # markdown extensions and output format handed to the cached renderers
    extensions = ['markdown.extensions.codehilite', 'markdown.extensions.extra',
                  'markdown.extensions.def_list', 'markdown.extensions.sane_lists']

    output_format = 'html4'

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        runs detection of what renderer should be used for generating html
        from a markup language

        :param source: unused by the detection itself
        :param filename: name whose extension selects the renderer; a
            missing name selects the plain renderer
        :return: the matching renderer classmethod of ``MarkupRenderer``
        """
        # ``filename`` defaults to None, which the regex calls cannot handle
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # readme files without extension and everything unknown
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def bleach_clean(cls, text):
        """
        Sanitise ``text`` with bleach using the tag and attribute whitelists
        from ``bleach_whitelist``.

        :return: cleaned markup, or the literal ``'UNPARSEABLE TEXT'`` when
            bleach fails.
        """
        from .bleach_whitelist import markdown_attrs, markdown_tags
        allowed_tags = markdown_tags
        allowed_attrs = markdown_attrs

        try:
            return bleach.clean(text, tags=allowed_tags, attributes=allowed_attrs)
        except Exception:
            # keep the best-effort placeholder but leave a trace in the
            # log, like the other renderers do
            log.exception('Error when sanitizing text with bleach')
            return 'UNPARSEABLE TEXT'

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.
        Returns None when no renderer can be detected.

        :param filename: file name to inspect
        :param exclude: list or tuple of extensions to ignore; any other
            value disables the filtering
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        markdown_exts = _filter([ext for ext, _ in cls.MARKDOWN_EXTS if ext])
        if filename.endswith(tuple(markdown_exts)):
            return 'markdown'

        rst_exts = _filter([ext for ext, _ in cls.RST_EXTS if ext])
        if filename.endswith(tuple(rst_exts)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Renders ``source`` with the renderer detected from ``filename``.
        When nothing more specific matches, the plain renderer is used.

        :param source: text to render
        :param filename: name used for renderer detection
        """
        renderer = self._detect_renderer(source, filename)
        readme_data = renderer(source)
        return readme_data

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown

        :param text: markdown source
        :return: source with underscores of multi-underscore words escaped,
            leaving ``<pre>`` blocks untouched
        """

        # Extract pre blocks.
        extractions = {}

        def pre_extraction_callback(matchobj):
            digest = md5_safe(matchobj.group(0))
            extractions[digest] = matchobj.group(0)
            return "{gfm-extraction-%s}" % digest

        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s

        # NOTE(review): without re.MULTILINE ``^`` only anchors at the very
        # start of the text - confirm whether per-line matching was intended
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # Insert pre block extractions.
        def pre_insert_callback(matchobj):
            digest = matchobj.group(1)
            if digest not in extractions:
                # a placeholder-looking token that was part of the original
                # text, leave it alone instead of failing with KeyError
                return matchobj.group(0)
            return '\n\n' + extractions[digest]

        text = re.sub(r'\{gfm-extraction-([0-9a-f]{32})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def urlify_text(cls, text):
        """
        Wrap every match of ``URL_PAT`` in ``text`` into an ``<a>`` element
        pointing at the matched URL.
        """
        def url_func(match_obj):
            url_full = match_obj.group(1)
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def convert_mentions(cls, text, mode):
        """
        Replace every match of ``MENTION_PAT`` with highlighted markup.

        :param text: source text
        :param mode: ``'markdown'`` emits a ``<strong>`` element with
            hovercard data attributes, ``'rst'`` emits bold rst markup
        :raises ValueError: when a mention is found and ``mode`` is neither
            ``'rst'`` nor ``'markdown'``
        :return: converted text with surrounding whitespace stripped
        """
        mention_pat = cls.MENTION_PAT

        def wrapp(match_obj):
            uname = match_obj.group(1)
            hovercard_url = "pyroutes.url('hovercard_username', {'username': '%s'});" % uname

            if mode == 'markdown':
                tmpl = '<strong class="tooltip-hovercard" data-hovercard-alt="{uname}" data-hovercard-url="{hovercard_url}">@{uname}</strong>'
            elif mode == 'rst':
                tmpl = ' **@{uname}** '
            else:
                raise ValueError('mode must be rst or markdown')

            return tmpl.format(**{'uname': uname,
                                  'hovercard_url': hovercard_url})

        return mention_pat.sub(wrapp, text).strip()

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render ``source`` as plain text: URLs become links, newlines become
        ``<br />`` and the result is passed through ``bleach_clean``.

        :param universal_newline: normalise all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a ``<br />``
        """
        source = safe_unicode(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')

        rendered = cls.bleach_clean(source)
        return rendered

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        returns markdown rendered code cleaned by the bleach library

        :param source: markdown text
        :param safe: fall back to the plain renderer when rendering fails,
            instead of re-raising
        :param flavored: use the GitHub flavored renderer and preprocessing
        :param mentions: highlight mentions before rendering
        :param clean_html: pass the rendered markup through ``bleach_clean``
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='markdown')
            # we extracted mentions render with this using Mentions false;
            # clean_html is forwarded so the caller's choice is honoured
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        if flavored:
            markdown_renderer = get_markdown_renderer_flavored(
                cls.extensions, cls.output_format)
        else:
            markdown_renderer = get_markdown_renderer(
                cls.extensions, cls.output_format)

        source = safe_unicode(source)

        try:
            if flavored:
                source = cls._flavored_markdown(source)
            # the renderer instance is cached and reused between documents,
            # so its per-document state has to be cleared first
            markdown_renderer.reset()
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.bleach_clean(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        Render reStructuredText ``source`` to an HTML fragment prefixed with
        the document's ``html_title`` part.

        :param source: rst text
        :param safe: fall back to the plain renderer when rendering fails,
            instead of re-raising
        :param mentions: highlight mentions before rendering
        :param clean_html: pass the rendered fragment through
            ``bleach_clean``
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='rst')
            # we extracted mentions render with this using Mentions false;
            # clean_html is forwarded so the caller's choice is honoured
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_unicode(source)
        try:
            docutils_settings = dict(
                [(alias, None) for alias in
                 cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])

            docutils_settings.update({
                'input_encoding': 'unicode',
                'report_level': 4,
                'syntax_highlight': 'short',
            })

            # registering None in place of a directive class disables the
            # directive; only the disallowed names are registered, the
            # remaining settings keys are not directives
            for alias in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(alias, None)

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.bleach_clean(rendered)
            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a Jupyter notebook given as text into HTML, prefixed with the
        notebook stylesheet link and the MathJax configuration.

        :param source: notebook content, read with ``nbformat`` as version 4
        :param safe: when true, javascript outputs are replaced by a
            placeholder and markdown cells are sanitised with
            ``bleach_clean``
        """
        from rhodecode.lib import helpers

        from traitlets.config import Config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor

        class CustomHTMLExporter(HTMLExporter):
            def _template_file_default(self):
                return 'basic'

        class Sandbox(Preprocessor):

            def preprocess(self, nb, resources):
                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                for cell in nb['cells']:
                    if not safe:
                        continue

                    if 'outputs' in cell:
                        for cell_output in cell['outputs']:
                            if 'data' in cell_output:
                                if 'application/javascript' in cell_output['data']:
                                    cell_output['data']['text/plain'] = sandbox_text
                                    cell_output['data'].pop('application/javascript', None)

                    if 'source' in cell and cell['cell_type'] == 'markdown':
                        # sanitize similar like in markdown
                        cell['source'] = cls.bleach_clean(cell['source'])

                return nb, resources

        # NOTE: not called anywhere inside this method at the moment
        def _sanitize_resources(input_resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't messes up UI so much
            """

            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS
            input_resources[0] = input_resources[0][input_resources[0].find('/*! Source'):]
            return input_resources

        def as_html(notebook):
            conf = Config()
            conf.CustomHTMLExporter.preprocessors = [Sandbox]
            html_exporter = CustomHTMLExporter(config=conf)

            (body, resources) = html_exporter.from_notebook_node(notebook)
            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- MathJax configuration -->
            <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                TeX: {
                    extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                },
                tex2jax: {
                    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                    processEscapes: true,
                    processEnvironments: true
                },
                // Center justify equations in code and markdown cells. Elsewhere
                // we use CSS to left justify single line equations in code cells.
                displayAlign: 'center',
                "HTML-CSS": {
                    styles: {'.MathJax_Display': {"margin": 0}},
                    linebreaks: { automatic: true },
                    availableFonts: ["STIX", "TeX"]
                },
                showMathMenu: false
            });
            </script>
            <!-- End of MathJax configuration -->
            <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = MakoTemplate(r'''
            <link rel="stylesheet" type="text/css" href="${h.asset('css/style-ipython.css', ver=ver)}" media="screen"/>
            ''').render(h=helpers, ver='ver1')

            body = '\n'.join([header, css, js, body])
            return body, resources

        notebook = nbformat.reads(source, as_version=4)
        (body, resources) = as_html(notebook)
        return body
r1 | ||||
class RstTemplateRenderer(object):
    """
    Renders Mako templates looked up in the ``templates/rst_templates``
    directory located one level above this module's directory.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        base = os.path.abspath(os.path.dirname(module_dir))
        rst_template_dirs = [os.path.join(base, 'templates', 'rst_templates')]
        # every template gets the helpers module available as ``h``
        self.template_store = TemplateLookup(
            directories=rst_template_dirs,
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Fetch the template called ``templatename`` from the lookup."""
        return self.template_store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """
        Render the template ``template_name``, passing ``kwargs`` on to the
        template.
        """
        return self._get_template(template_name).render(**kwargs)