markup_renderer.py
246 lines
| 9.6 KiB
| text/x-python
|
PythonLexer
Bradley M. Kuhn
|
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
kallithea.lib.markup_renderer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Renderer for markup languages with ability to parse using rst or markdown

This file was forked by the Kallithea project in July 2014.
Original author and date, and relevant copyright and licensing information is below:
:created_on: Oct 27, 2011
:author: marcink
:copyright: (c) 2013 RhodeCode GmbH, and others.
:license: GPLv3, see LICENSE.md for more details.
"""
Mads Kiilerich
|
r8437 | import hashlib | ||
Mads Kiilerich
|
r7718 | import logging | ||
Bradley M. Kuhn
|
r4187 | import re | ||
import traceback | ||||
Mads Kiilerich
|
r7718 | import bleach | ||
Mads Kiilerich
|
r7321 | import markdown as markdown_mod | ||
Mads Kiilerich
|
r8437 | from docutils.core import publish_parts | ||
from docutils.parsers.rst import directives | ||||
Mads Kiilerich
|
r7321 | |||
Mads Kiilerich
|
r8565 | from kallithea.lib import webutils | ||
Mads Kiilerich
|
r7718 | |||
Bradley M. Kuhn
|
r4187 | |||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Matches http:// and https:// URLs embedded in free text.  The first group
# is the host part (optionally with user info / port); the optional second
# group is the path/query/fragment and must end in a character that is
# unlikely to be sentence punctuation, so a trailing '.' or ',' is left out
# of the match.
url_re = re.compile(r'''\bhttps?://(?:[\da-zA-Z0-9@:.-]+)'''
                    r'''(?:[/a-zA-Z0-9_=@#~&+%.,:;?!*()-]*[/a-zA-Z0-9_=@#~])?''')
Mads Kiilerich
|
r4691 | |||
Bradley M. Kuhn
|
class MarkupRenderer(object):
    """
    Render Markdown, reStructuredText or plain text to HTML.

    ``render`` is the sanitizing entry point: it picks a renderer from the
    file name and passes the result through ``bleach.clean``.  The
    individual renderers (``plain``, ``markdown``, ``rst``) return
    unsanitized HTML.
    """

    # reStructuredText directives that are disabled before rendering.
    RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']

    # Patterns searched for anywhere in the file name (or renderer name).
    MARKDOWN_PAT = re.compile(r'md|mkdn?|mdown|markdown', re.IGNORECASE)
    RST_PAT = re.compile(r're?st', re.IGNORECASE)
    PLAIN_PAT = re.compile(r'readme', re.IGNORECASE)

    @classmethod
    def _detect_renderer(cls, source, filename):
        """
        Run detection of what renderer should be used for generating html
        from a markup language.

        :param source: the text to render (not inspected by the detection)
        :param filename: file name, or explicitly a renderer name; ``None``
            selects the plain renderer
        :returns: one of the bound renderers ``cls.markdown``, ``cls.rst``
            or ``cls.plain``
        """
        if filename is None:
            # render() defaults filename to None; the regex methods would
            # raise TypeError on it, so fall back to plain rendering.
            return cls.plain
        if cls.MARKDOWN_PAT.search(filename):
            return cls.markdown
        if cls.RST_PAT.search(filename):
            return cls.rst
        if cls.PLAIN_PAT.search(filename):
            # A name matching "readme" without a recognized markup
            # extension is rendered as reStructuredText.
            return cls.rst
        return cls.plain

    @classmethod
    def _flavored_markdown(cls, text):
        """
        Github style flavored markdown pre-processing.

        ``<pre>`` blocks are set aside, underscores in a leading
        ``foo_bar_baz`` style word are escaped, single newlines after text
        lines become hard line breaks, and the ``<pre>`` blocks are put back.

        :param text: Markdown source
        :returns: the pre-processed Markdown source
        """
        # Extract pre blocks, keyed by the sha1 hex digest of their content.
        extractions = {}

        def pre_extraction_callback(matchobj):
            block = matchobj.group(0)
            # hashlib requires bytes; hashing the str directly raises
            # TypeError on Python 3.
            digest = hashlib.sha1(
                block.encode('utf-8', 'backslashreplace')).hexdigest()
            extractions[digest] = block
            return "{gfm-extraction-%s}" % digest

        pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
        text = re.sub(pattern, pre_extraction_callback, text)

        # Prevent foo_bar_baz from ending up with an italic word in the middle.
        def italic_callback(matchobj):
            s = matchobj.group(0)
            if s.count('_') >= 2:
                return s.replace('_', r'\_')
            return s

        # NOTE(review): without re.MULTILINE, '^' only matches at the very
        # start of the text, so only a leading word is escaped.  Enabling
        # MULTILINE would also touch lines inside fenced code blocks - confirm
        # the intended behavior before changing this.
        text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)

        # In very clear cases, let newlines become <br /> tags.
        def newline_callback(matchobj):
            if len(matchobj.group(1)) == 1:
                # Markdown needs two trailing spaces for a hard line break.
                return matchobj.group(0).rstrip() + '  \n'
            return matchobj.group(0)

        pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE)
        text = re.sub(pattern, newline_callback, text)

        # Insert pre block extractions.  A sha1 hex digest is 40 characters.
        def pre_insert_callback(matchobj):
            return '\n\n' + extractions[matchobj.group(1)]

        text = re.sub(r'\{gfm-extraction-([0-9a-f]{40})\}',
                      pre_insert_callback, text)

        return text

    @classmethod
    def render(cls, source, filename=None):
        """
        Render ``source`` using the renderer detected from ``filename``
        (file extension or explicit renderer name) and sanitize the result.
        At last it will just do a simple html replacing new lines with <br/>

        :param source: markup text to render
        :param filename: file name or renderer name used for detection
        :returns: sanitized HTML

        >>> MarkupRenderer.render('''<img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg">''', '.md')
        '<p><img id="a" src="http://example.com/test.jpg" style="color: red;"></p>'
        >>> MarkupRenderer.render('''<img class="c d" src="file://localhost/test.jpg">''', 'b.mkd')
        '<p><img class="c d"></p>'
        >>> MarkupRenderer.render('''<a href="foo">foo</a>''', 'c.mkdn')
        '<p><a href="foo">foo</a></p>'
        >>> MarkupRenderer.render('''<script>alert(1)</script>''', 'd.mdown')
        '&lt;script&gt;alert(1)&lt;/script&gt;'
        >>> MarkupRenderer.render('''<div onclick="alert(2)">yo</div>''', 'markdown')
        '<div>yo</div>'
        >>> MarkupRenderer.render('''<a href="javascript:alert(3)">yo</a>''', 'md')
        '<p><a>yo</a></p>'
        """
        renderer = cls._detect_renderer(source, filename)
        readme_data = renderer(source)
        # Allow most HTML, while preventing XSS issues:
        # no <script> tags, no onclick attributes, no javascript
        # "protocol", and also limit styling to prevent defacing.
        # NOTE(review): the ``styles`` argument was removed in bleach 5.0 in
        # favour of ``css_sanitizer`` - confirm the pinned bleach version.
        return bleach.clean(readme_data,
            tags=['a', 'abbr', 'b', 'blockquote', 'br', 'code', 'dd',
                  'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5',
                  'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 'pre', 'span',
                  'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th',
                  'thead', 'tr', 'ul'],
            attributes=['class', 'id', 'style', 'label', 'title', 'alt', 'href', 'src'],
            styles=['color'],
            protocols=['http', 'https', 'mailto'],
            )

    @classmethod
    def plain(cls, source, universal_newline=True):
        """
        Convert plain text to HTML: URLs become links and newlines become
        ``<br />``.  The text itself is not HTML-escaped, so the output
        should be sanitized before use.

        :param source: plain text
        :param universal_newline: normalize all line endings to ``\\n`` first
        :returns: HTML fragment, always starting with ``<br />``

        >>> MarkupRenderer.plain('https://example.com/')
        '<br /><a href="https://example.com/">https://example.com/</a>'
        """
        if universal_newline:
            newline = '\n'
            source = newline.join(source.splitlines())

        def url_func(match_obj):
            url_full = match_obj.group(0)
            return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})

        source = url_re.sub(url_func, source)
        return '<br />' + source.replace("\n", '<br />')

    @classmethod
    def markdown(cls, source, safe=True, flavored=False):
        """
        Convert Markdown (possibly GitHub Flavored) to INSECURE HTML, possibly
        with "safe" fall-back to plaintext. Output from this method should be sanitized before use.

        :param source: Markdown text
        :param safe: on any rendering error, fall back to ``plain`` instead
            of re-raising
        :param flavored: pre-process with ``_flavored_markdown`` first

        >>> MarkupRenderer.markdown('''<img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg">''')
        '<p><img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg"></p>'
        >>> MarkupRenderer.markdown('''<img class="c d" src="file://localhost/test.jpg">''')
        '<p><img class="c d" src="file://localhost/test.jpg"></p>'
        >>> MarkupRenderer.markdown('''<a href="foo">foo</a>''')
        '<p><a href="foo">foo</a></p>'
        >>> MarkupRenderer.markdown('''<script>alert(1)</script>''')
        '<script>alert(1)</script>'
        >>> MarkupRenderer.markdown('''<div onclick="alert(2)">yo</div>''')
        '<div onclick="alert(2)">yo</div>'
        >>> MarkupRenderer.markdown('''<a href="javascript:alert(3)">yo</a>''')
        '<p><a href="javascript:alert(3)">yo</a></p>'
        >>> MarkupRenderer.markdown('''## Foo''')
        '<h2>Foo</h2>'
        >>> print(MarkupRenderer.markdown('''
        ...     #!/bin/bash
        ...     echo "hello"
        ... '''))
        <table class="code-highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre>1
        2</pre></div></td><td class="code"><div class="code-highlight"><pre><span></span><span class="ch">#!/bin/bash</span>
        <span class="nb">echo</span> <span class="s2">&quot;hello&quot;</span>
        </pre></div>
        </td></tr></table>
        """
        try:
            if flavored:
                source = cls._flavored_markdown(source)
            return markdown_mod.markdown(
                source,
                extensions=['markdown.extensions.codehilite', 'markdown.extensions.extra'],
                extension_configs={'markdown.extensions.codehilite': {'css_class': 'code-highlight'}})
        except Exception:
            log.error(traceback.format_exc())
            if safe:
                log.debug('Falling back to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def rst(cls, source, safe=True):
        """
        Convert reStructuredText to INSECURE HTML, possibly with "safe"
        fall-back to plaintext.  Output should be sanitized before use.

        :param source: reStructuredText text
        :param safe: on any rendering error, fall back to ``plain`` instead
            of re-raising
        :returns: the HTML title followed by the HTML body fragment
        """
        try:
            # Registering a directive name as None makes docutils treat it
            # as unknown, which disables it.
            for alias in cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES:
                directives.register_directive(alias, None)

            # Only real docutils settings go here.  file_insertion_enabled
            # and raw_enabled are the documented switches for untrusted
            # input; they also cover directives such as csv-table :file:
            # that the directive list above does not.
            docutils_settings = {
                'input_encoding': 'unicode',
                'report_level': 4,
                'file_insertion_enabled': False,
                'raw_enabled': False,
            }

            parts = publish_parts(source=source,
                                  writer_name="html4css1",
                                  settings_overrides=docutils_settings)

            return parts['html_title'] + parts["fragment"]
        except Exception:
            log.error(traceback.format_exc())
            if safe:
                log.debug('Falling back to render in plain mode')
                return cls.plain(source)
            else:
                raise

    @classmethod
    def rst_with_mentions(cls, source):
        """
        Render reStructuredText with every match of
        ``webutils.MENTIONS_REGEX`` emphasized as bold ``@name``.

        :param source: reStructuredText text
        :returns: HTML as produced by ``rst``
        """
        def wrapp(match_obj):
            uname = match_obj.groups()[0]
            # The escaped spaces let the bold markup sit directly next to
            # surrounding text.
            return r'\ **@%(uname)s**\ ' % {'uname': uname}

        mention_hl = webutils.MENTIONS_REGEX.sub(wrapp, source).strip()
        return cls.rst(mention_hl)