markup_renderer.py
579 lines
| 19.6 KiB
| text/x-python
|
PythonLexer
r5608 | # Copyright (C) 2011-2024 RhodeCode GmbH | |||
r1 | # | |||
# This program is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU Affero General Public License, version 3 | ||||
# (only), as published by the Free Software Foundation. | ||||
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU Affero General Public License | ||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
# | ||||
# This program is dual-licensed. If you wish to learn more about the | ||||
# RhodeCode Enterprise Edition, including its added features, Support services, | ||||
# and proprietary license terms, please see https://rhodecode.com/licenses/ | ||||
""" | ||||
Renderer for markup languages with ability to parse using rst or markdown | ||||
""" | ||||
import re | ||||
import os | ||||
r1527 | import lxml | |||
r1 | import logging | |||
r4919 | import urllib.parse | |||
r5079 | import pycmarkgfm | |||
r396 | ||||
r1 | from mako.lookup import TemplateLookup | |||
r1491 | from mako.template import Template as MakoTemplate | |||
r1 | ||||
from docutils.core import publish_parts | ||||
from docutils.parsers.rst import directives | ||||
r1833 | from docutils import writers | |||
from docutils.writers import html4css1 | ||||
r1 | import markdown | |||
r5098 | from rhodecode.lib.utils2 import safe_str, MENTIONS_REGEX | |||
r1 | ||||
# module level logger
log = logging.getLogger(__name__)

# default renderer used to generate automated comments
DEFAULT_COMMENTS_RENDERER = 'rst'

# lxml is treated as optional: when it cannot be imported both helpers are
# set to None so callers can detect the missing library
try:
    from lxml.html import fromstring, tostring
except ImportError:
    log.exception('Failed to import lxml')
    fromstring = tostring = None
r1 | ||||
class CustomHTMLTranslator(writers.html4css1.HTMLTranslator):
    """
    HTML translator that neutralizes ``javascript:`` reference links and
    opens literal blocks with the ``codehilite literal-block`` CSS classes.
    """

    def visit_literal_block(self, node):
        # emit the opening <pre> tag carrying the highlighting classes
        opening_tag = self.starttag(
            node, 'pre', CLASS='codehilite literal-block')
        self.body.append(opening_tag)

    def visit_reference(self, node):
        if 'refuri' in node.attributes:
            # everything before the first colon is the candidate URI scheme;
            # ``colon`` is empty when the URI contains no colon at all
            scheme, colon, _rest = node['refuri'].lstrip().partition(':')
            if colon and scheme.lower() == 'javascript':
                # we don't allow javascript type of refs, replace the target
                # with a harmless marker
                node['refuri'] = 'javascript:alert("SandBoxedJavascript")'

        # explicit base-class call kept from the original implementation
        return html4css1.HTMLTranslator.visit_reference(self, node)
class RhodeCodeWriter(writers.html4css1.Writer):
    """html4css1 writer that renders through ``CustomHTMLTranslator``."""

    def __init__(self):
        super().__init__()
        # assigned after the base initializer has run, so this value is the
        # one left on the instance
        self.translator_class = CustomHTMLTranslator
def relative_links(html_source, server_paths):
    """
    Rewrite relative links found in rendered HTML so they point at server
    paths.

    ``src`` attributes of ``img``/``video`` elements are resolved against
    ``server_paths['raw']``. ``href`` attributes of ``a`` elements (except
    those with the ``gfm`` class) are resolved against ``server_paths['raw']``
    when they end with ``?raw=1`` and against ``server_paths['standard']``
    otherwise.

    :param html_source: HTML markup to process
    :param server_paths: mapping with ``raw`` and ``standard`` request paths
    :return: the rewritten HTML as a string; ``html_source`` is returned
        unchanged when it is empty, when lxml is not available or when the
        markup cannot be parsed
    """
    if not html_source:
        return html_source

    # both helpers are None when lxml failed to import. The previous check
    # ``not fromstring and tostring`` evaluated as ``(not fromstring) and
    # tostring`` and therefore never triggered in that situation.
    if not (fromstring and tostring):
        return html_source

    try:
        doc = fromstring(html_source)
    except Exception:
        # best effort: markup that can't be parsed is passed through as-is
        return html_source

    for el in doc.cssselect('img, video'):
        src = el.attrib.get('src')
        if src:
            el.attrib['src'] = relative_path(src, server_paths['raw'])

    for el in doc.cssselect('a:not(.gfm)'):
        href = el.attrib.get('href')
        if href:
            # links explicitly asking for raw content resolve against the
            # raw path, everything else against the standard one
            mode = 'raw' if href.endswith('?raw=1') else 'standard'
            el.attrib['href'] = relative_path(href, server_paths[mode])

    return tostring(doc, encoding='unicode')
r1527 | ||||
def relative_path(path, request_path, is_repo_file=None):
    """
    Resolve a relative link against the current (non absolute) server path.

    Example::

        path = '../logo.png'
        request_path = '/repo/files/path/file.md'
        result = '/repo/files/logo.png'

    :param path: link target as found in the document
    :param request_path: server path of the document being rendered
    :param is_repo_file: optional callable receiving ``request_path``; a
        truthy result means the request path is a file, so resolution starts
        one level up. When not given every request path is assumed to be a
        file.
    :return: ``request_path`` for an empty ``path``; ``path`` itself for
        data/javascript/anchor/invalid links, links with a network location
        and when there is no request path; otherwise the resolved path with
        a single leading slash
    """
    # TODO(marcink): unicode/str support ?
    # maybe=> safe_str(urllib.quote(safe_str(final_path), '/:'))

    # assume default is a valid file path
    file_check = is_repo_file or (lambda _p: True)

    if not path:
        return request_path

    target = safe_str(path)
    base = safe_str(request_path)

    # skip data, anchor and invalid links as well as links that carry a
    # network location (absolute URLs)
    untouched_prefixes = ('data:', 'javascript:', '#', ':')
    if target.startswith(untouched_prefixes) \
            or urllib.parse.urlparse(target).netloc:
        return target

    if not base:
        return target

    # drop one leading "/" and then one leading "./"
    for prefix in ('/', './'):
        if target.startswith(prefix):
            target = target[len(prefix):]

    # compute how deep we need to traverse the request path: one level when
    # the request path is a VALID file, plus one per leading "../"
    levels_up = 1 if file_check(base) else 0
    while target.startswith('../'):
        target = target[3:]
        levels_up += 1

    segments = base.split('/')
    if levels_up > 0:
        segments = segments[:-levels_up]

    joined = '/'.join(segments + [target])
    return '/' + joined.lstrip('/')
r1527 | ||||
_cached_markdown_renderer = None
# (extensions, output_format) the cached renderer was built with
_cached_markdown_renderer_key = None


def get_markdown_renderer(extensions, output_format):
    """
    Return a cached ``markdown.Markdown`` instance for the given options.

    The instance is rebuilt whenever the requested options differ from the
    ones the cached instance was created with; previously the first call won
    and later calls silently got a renderer built for other options. A
    reused instance is ``reset()`` before being handed out so that state
    collected while converting a previous document does not carry over.

    :param extensions: sequence of markdown extensions; ``legacy_attrs`` is
        always appended
    :param output_format: output format passed to ``markdown.Markdown``
    :return: the shared ``markdown.Markdown`` instance
    """
    # NOTE(review): the instance is shared module wide; whether concurrent
    # use from several threads is safe is not addressed here - confirm.
    global _cached_markdown_renderer, _cached_markdown_renderer_key

    cache_key = (tuple(extensions), output_format)
    if _cached_markdown_renderer is None \
            or _cached_markdown_renderer_key != cache_key:
        _cached_markdown_renderer = markdown.Markdown(
            extensions=list(extensions) + ['legacy_attrs'],
            output_format=output_format)
        _cached_markdown_renderer_key = cache_key
    else:
        _cached_markdown_renderer.reset()
    return _cached_markdown_renderer
def get_markdown_renderer_flavored(extensions, output_format):
    """
    Thin wrapper mimicking the markdown ``convert()`` API for GitHub
    flavored markdown.

    The source is parsed with pycmarkgfm (``hardbreaks`` option), turned
    back into CommonMark text and then rendered by the regular markdown
    renderer.
    """
    base_renderer = get_markdown_renderer(extensions, output_format)

    class GFM(object):
        def convert(self, source):
            gfm_options = pycmarkgfm.options.hardbreaks
            with pycmarkgfm.parse_gfm(source, options=gfm_options) as document:
                commonmark_source = document.to_commonmark()
            return base_renderer.convert(commonmark_source)

    return GFM()
r3239 | ||||
class MarkupRenderer(object):
    """
    Renders markup sources (Markdown, reStructuredText, Jupyter notebooks
    and plain text) into HTML.
    """
    # rst directives that are registered as ``None`` before rendering
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    MARKDOWN_PAT = re.compile(r'\.(md|mkdn?|mdown|markdown)$', re.IGNORECASE)
    RST_PAT = re.compile(r'\.re?st$', re.IGNORECASE)
    JUPYTER_PAT = re.compile(r'\.(ipynb)$', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'^readme$', re.IGNORECASE)

    URL_PAT = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                         r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')

    MENTION_PAT = re.compile(MENTIONS_REGEX)

    # options handed to the markdown renderer
    extensions = ['markdown.extensions.codehilite', 'markdown.extensions.extra',
                  'markdown.extensions.def_list', 'markdown.extensions.sane_lists']

    output_format = 'html4'

    # extension together with weights. Lower is first means we control how
    # extensions are attached to readme names with those.
    PLAIN_EXTS = [
        # prefer no extension
        ('', 0),  # special case that renders READMES names without extension
        ('.text', 2), ('.TEXT', 2),
        ('.txt', 3), ('.TXT', 3)
    ]

    RST_EXTS = [
        ('.rst', 1), ('.rest', 1),
        ('.RST', 2), ('.REST', 2)
    ]

    MARKDOWN_EXTS = [
        ('.md', 1), ('.MD', 1),
        ('.mkdn', 2), ('.MKDN', 2),
        ('.mdown', 3), ('.MDOWN', 3),
        ('.markdown', 4), ('.MARKDOWN', 4)
    ]

    def _detect_renderer(self, source, filename=None):
        """
        Pick the renderer callable used to generate html from a markup
        source, based on the extension of ``filename``.

        :param source: markup source (not inspected by the detection)
        :param filename: name of the rendered file; ``None`` or an empty
            name selects the plain renderer
        :return: one of the ``markdown``, ``rst``, ``jupyter`` or ``plain``
            classmethods
        """
        # ``render()`` defaults filename to None, which the regex methods
        # reject with a TypeError, so normalize it to an empty name
        filename = filename or ''

        if MarkupRenderer.MARKDOWN_PAT.search(filename):
            detected_renderer = 'markdown'
        elif MarkupRenderer.RST_PAT.search(filename):
            detected_renderer = 'rst'
        elif MarkupRenderer.JUPYTER_PAT.search(filename):
            detected_renderer = 'jupyter'
        else:
            # extension-less README names and everything unknown are plain
            detected_renderer = 'plain'

        return getattr(MarkupRenderer, detected_renderer)

    @classmethod
    def sanitize_html(cls, text):
        """
        Sanitize ``text`` with the project's HTML filter, called with
        ``markdown=True``.
        """
        from .html_filters import sanitize_html
        return sanitize_html(text, markdown=True)

    @classmethod
    def renderer_from_filename(cls, filename, exclude):
        """
        Detect renderer markdown/rst from filename and optionally use exclude
        list to remove some options. This is mostly used in helpers.

        :param filename: file name to inspect
        :param exclude: list/tuple of extensions that must not be matched;
            any other value disables the filtering
        :return: ``'markdown'``, ``'rst'`` or None when no renderer can be
            detected
        """
        def _filter(elements):
            if isinstance(exclude, (list, tuple)):
                return [x for x in elements if x not in exclude]
            return elements

        def _extensions(weighted_exts):
            # drop the weights and any empty extension, then apply excludes
            return tuple(_filter([ext for ext, _weight in weighted_exts if ext]))

        if filename.endswith(_extensions(cls.MARKDOWN_EXTS)):
            return 'markdown'
        if filename.endswith(_extensions(cls.RST_EXTS)):
            return 'rst'

        return None

    def render(self, source, filename=None):
        """
        Render ``source`` with the renderer detected from the extension of
        ``filename``; the plain renderer is used when nothing more specific
        matches.

        :param source: markup source
        :param filename: name of the rendered file, used for detection only
        :return: rendered HTML
        """
        renderer = self._detect_renderer(source, filename)
        return renderer(source)

    @classmethod
    def urlify_text(cls, text):
        """
        Wrap every http(s) URL matched by ``URL_PAT`` in an ``<a>`` tag.
        """
        def url_func(match_obj):
            url_full = match_obj.groups()[0]
            return f'<a href="{url_full}">{url_full}</a>'

        return cls.URL_PAT.sub(url_func, text)

    @classmethod
    def convert_mentions(cls, text, mode):
        """
        Replace user mentions matched by ``MENTION_PAT`` with markup that
        highlights them.

        :param text: source text
        :param mode: ``'markdown'`` emits a ``<strong>`` hovercard element,
            ``'rst'`` emits bold rst markup
        :raises ValueError: for any other mode, once a mention is found
        :return: converted text, stripped of surrounding whitespace
        """
        mention_pat = cls.MENTION_PAT

        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            hovercard_url = "pyroutes.url('hovercard_username', {'username': '%s'});" % uname

            if mode == 'markdown':
                tmpl = '<strong class="tooltip-hovercard" data-hovercard-alt="{uname}" data-hovercard-url="{hovercard_url}">@{uname}</strong>'
            elif mode == 'rst':
                tmpl = ' **@{uname}** '
            else:
                raise ValueError('mode must be rst or markdown')

            return tmpl.format(**{'uname': uname,
                                  'hovercard_url': hovercard_url})

        return mention_pat.sub(wrapp, text).strip()

    @classmethod
    def plain(cls, source, universal_newline=True, leading_newline=True):
        """
        Render plain text: URLs become links, newlines become ``<br />`` and
        the result is sanitized.

        :param source: text to render
        :param universal_newline: normalize all line endings to ``\\n`` first
        :param leading_newline: prefix the output with a ``<br />``
        :return: sanitized HTML
        """
        source = safe_str(source)
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        rendered_source = cls.urlify_text(source)
        source = ''
        if leading_newline:
            source += '<br />'
        source += rendered_source.replace("\n", '<br />')

        rendered = cls.sanitize_html(source)
        return rendered

    @classmethod
    def markdown(cls, source, safe=True, flavored=True, mentions=False,
                 clean_html=True):
        """
        Render markdown ``source`` to HTML.

        :param source: markdown source
        :param safe: on a rendering error fall back to the plain renderer
            instead of re-raising
        :param flavored: render through the GitHub-flavored wrapper
        :param mentions: convert user mentions before rendering
        :param clean_html: sanitize the rendered HTML
        :return: rendered HTML
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='markdown')
            # we extracted mentions render with this using Mentions false.
            # clean_html has to be forwarded, otherwise a caller asking for
            # unsanitized output would get the default back.
            return cls.markdown(mention_hl, safe=safe, flavored=flavored,
                                mentions=False, clean_html=clean_html)

        if flavored:
            markdown_renderer = get_markdown_renderer_flavored(
                cls.extensions, cls.output_format)
        else:
            markdown_renderer = get_markdown_renderer(
                cls.extensions, cls.output_format)

        try:
            rendered = markdown_renderer.convert(source)
        except Exception:
            log.exception('Error when rendering Markdown')
            if safe:
                log.debug('Fallback to render in plain mode')
                rendered = cls.plain(source)
            else:
                raise

        if clean_html:
            rendered = cls.sanitize_html(rendered)
        return rendered

    @classmethod
    def rst(cls, source, safe=True, mentions=False, clean_html=False):
        """
        Render reStructuredText ``source`` to HTML.

        :param source: rst source
        :param safe: on a rendering error fall back to the plain renderer
            instead of re-raising
        :param mentions: convert user mentions before rendering
        :param clean_html: sanitize the rendered fragment
        :return: the html title followed by the rendered fragment
        """
        if mentions:
            mention_hl = cls.convert_mentions(source, mode='rst')
            # we extracted mentions render with this using Mentions false.
            # clean_html has to be forwarded, otherwise a requested
            # sanitization would silently be skipped.
            return cls.rst(mention_hl, safe=safe, mentions=False,
                           clean_html=clean_html)

        source = safe_str(source)
        try:
            # only the disallowed directives are overridden with None; the
            # docutils settings below are settings, not directives, and are
            # no longer registered as such
            for directive_name in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(directive_name, None)

            docutils_settings = {
                'input_encoding': 'unicode',
                'report_level': 4,
                'syntax_highlight': 'short',
            }

            parts = publish_parts(source=source,
                                  writer=RhodeCodeWriter(),
                                  settings_overrides=docutils_settings)
            rendered = parts["fragment"]
            if clean_html:
                rendered = cls.sanitize_html(rendered)

            return parts['html_title'] + rendered
        except Exception:
            log.exception('Error when rendering RST')
            if safe:
                log.debug('Fallback to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def jupyter(cls, source, safe=True):
        """
        Render a Jupyter notebook (JSON ``source``) to HTML.

        :param source: notebook content as accepted by ``nbformat.reads``
        :param safe: when true, markdown cells are sanitized and javascript
            outputs are replaced by a placeholder text
        :return: HTML body; an "Invalid Notebook!" block listing the
            validation errors when the notebook does not validate
        """
        from html import escape as html_escape

        from rhodecode.lib import helpers
        from .html_sanitizer_defs import markdown_attrs, all_tags, all_styles

        from traitlets import default, config
        import nbformat
        from nbconvert import HTMLExporter
        from nbconvert.preprocessors import Preprocessor
        from nbconvert.preprocessors.sanitize import SanitizeHTML

        class CustomHTMLExporter(HTMLExporter):

            @default("template_file")
            def _template_file_default(self):
                if self.template_extension:
                    return "basic/index" + self.template_extension

        class Sandbox(Preprocessor):
            def preprocess_cell(self, cell, resources, cell_index):
                if not safe:
                    return cell, resources

                sandbox_text = 'SandBoxed(IPython.core.display.Javascript object)'
                if cell.cell_type == "markdown":
                    cell.source = cls.sanitize_html(cell.source)
                    return cell, resources

                # replace javascript outputs with a plain text placeholder
                for cell_output in cell.get('outputs', []):
                    if 'data' in cell_output:
                        if 'application/javascript' in cell_output['data']:
                            cell_output['data']['text/plain'] = sandbox_text
                            cell_output['data'].pop('application/javascript', None)

                return cell, resources

        # NOTE: not called anywhere inside this method at the moment
        def _sanitize_resources(input_resources):
            """
            Skip/sanitize some of the CSS generated and included in jupyter
            so it doesn't mess up UI so much
            """
            # TODO(marcink): probably we should replace this with whole custom
            # CSS set that doesn't screw up, but jupyter generated html has some
            # special markers, so it requires Custom HTML exporter template with
            # _default_template_path_default, to achieve that

            # strip the reset CSS
            input_resources[0] = input_resources[0][input_resources[0].find('/*! Source'):]
            return input_resources

        def as_html(notebook):
            conf = config.Config()
            # TODO: Keep an eye on the order of preprocessors
            conf.CustomHTMLExporter.default_preprocessors = [Sandbox, SanitizeHTML]
            conf.Sandbox.enabled = True
            conf.SanitizeHTML.enabled = True
            conf.SanitizeHTML.attributes = markdown_attrs
            conf.SanitizeHTML.tags = all_tags
            conf.SanitizeHTML.styles = all_styles
            conf.SanitizeHTML.sanitized_output_types = {
                "text/html",
                "text/markdown",
            }
            # a comma was missing after "image/jpg", which merged it with
            # the next literal into a single bogus "image/jpgimage/jpeg" key
            conf.SanitizeHTML.safe_output_keys = {
                "metadata",
                "text/plain",
                "text/latex",
                "application/json",
                "image/png",
                "image/jpg",
                "image/jpeg",
                "image/svg",
                "image/svg+xml",
            }
            html_exporter = CustomHTMLExporter(config=conf)
            (body, resources) = html_exporter.from_notebook_node(notebook)

            header = '<!-- ## IPYTHON NOTEBOOK RENDERING ## -->'
            js = MakoTemplate(r'''
            <!-- MathJax configuration -->
            <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                jax: ["input/TeX","output/HTML-CSS", "output/PreviewHTML"],
                extensions: ["tex2jax.js","MathMenu.js","MathZoom.js", "fast-preview.js", "AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
                TeX: {
                    extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"]
                },
                tex2jax: {
                    inlineMath: [ ['$','$'], ["\\(","\\)"] ],
                    displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
                    processEscapes: true,
                    processEnvironments: true
                },
                // Center justify equations in code and markdown cells. Elsewhere
                // we use CSS to left justify single line equations in code cells.
                displayAlign: 'center',
                "HTML-CSS": {
                    styles: {'.MathJax_Display': {"margin": 0}},
                    linebreaks: { automatic: true },
                    availableFonts: ["STIX", "TeX"]
                },
                showMathMenu: false
            });
            </script>
            <!-- End of MathJax configuration -->
            <script src="${h.asset('js/src/math_jax/MathJax.js')}"></script>
            ''').render(h=helpers)

            css = MakoTemplate(r'''
            <link rel="stylesheet" type="text/css" href="${h.asset('css/style-ipython.css', ver=ver)}" media="screen"/>
            ''').render(h=helpers, ver='ver1')

            body = '\n'.join([header, css, js, body])
            return body, resources

        captured_errors = {}
        error_body = """
            <div style="text-align: center;">
                <h3>Invalid Notebook!</h3>
                <p>{}</p>
            </div>
        """
        # TODO: In the event of a newer jupyter notebook version, consider increasing the as_version parameter
        notebook = nbformat.reads(source, as_version=4, capture_validation_error=captured_errors)
        if captured_errors:
            # error texts are derived from the (untrusted) notebook, escape
            # them before placing them into the HTML body
            error_messages = '<br>'.join(
                html_escape(str(error)) for error in captured_errors.values())
            body = error_body.format(error_messages)
        else:
            try:
                body, _ = as_html(notebook)
            except (AttributeError, nbformat.ValidationError):
                # report the failure as an invalid notebook only when a
                # validation of the source confirms it, else re-raise
                try:
                    nbformat.validate(nbformat.reader.reads(source))
                except nbformat.ValidationError as exc:
                    body = error_body.format(html_escape(str(exc)))
                else:
                    raise

        return body
r1491 | ||||
r1 | ||||
class RstTemplateRenderer(object):
    """
    Renders Mako templates looked up in the ``templates/rst_templates``
    directory located one level above this module's directory.
    """

    def __init__(self):
        module_dir = os.path.dirname(__file__)
        base = os.path.abspath(os.path.dirname(module_dir))
        templates_dir = os.path.join(base, 'templates', 'rst_templates')
        self.template_store = TemplateLookup(
            directories=[templates_dir],
            input_encoding='utf-8',
            imports=['from rhodecode.lib import helpers as h'])

    def _get_template(self, templatename):
        """Fetch the template called ``templatename`` from the store."""
        store = self.template_store
        return store.get_template(templatename)

    def render(self, template_name, **kwargs):
        """Render ``template_name`` using ``kwargs`` as template context."""
        return self._get_template(template_name).render(**kwargs)