##// END OF EJS Templates
lib: use bleach to sanitize HTML generated from markdown - fix XSS issue when repo front page shows README.md...
Mads Kiilerich -
r7322:5746cc3b stable
parent child Browse files
Show More
@@ -1,214 +1,227 b''
1 1 # -*- coding: utf-8 -*-
2 2 # This program is free software: you can redistribute it and/or modify
3 3 # it under the terms of the GNU General Public License as published by
4 4 # the Free Software Foundation, either version 3 of the License, or
5 5 # (at your option) any later version.
6 6 #
7 7 # This program is distributed in the hope that it will be useful,
8 8 # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 9 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 10 # GNU General Public License for more details.
11 11 #
12 12 # You should have received a copy of the GNU General Public License
13 13 # along with this program. If not, see <http://www.gnu.org/licenses/>.
14 14 """
15 15 kallithea.lib.markup_renderer
16 16 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17 17
18 18 Renderer for markup languages with ability to parse using rst or markdown
19 19
20 20 This file was forked by the Kallithea project in July 2014.
21 21 Original author and date, and relevant copyright and licensing information is below:
22 22 :created_on: Oct 27, 2011
23 23 :author: marcink
24 24 :copyright: (c) 2013 RhodeCode GmbH, and others.
25 25 :license: GPLv3, see LICENSE.md for more details.
26 26 """
27 27
28 28
29 29 import re
30 30 import logging
31 31 import traceback
32 32
33 33 import markdown as markdown_mod
34 import bleach
34 35
35 36 from kallithea.lib.utils2 import safe_unicode, MENTIONS_REGEX
36 37
37 38 log = logging.getLogger(__name__)
38 39
39 40
40 41 url_re = re.compile(r'''(\bhttps?://(?:[\da-zA-Z0-9@:.-]+)'''
41 42 r'''(?:[/a-zA-Z0-9_=@#~&+%.,:;?!*()-]*[/a-zA-Z0-9_=@#~])?)''')
42 43
43 44 class MarkupRenderer(object):
44 45 RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES = ['include', 'meta', 'raw']
45 46
46 47 MARKDOWN_PAT = re.compile(r'md|mkdn?|mdown|markdown', re.IGNORECASE)
47 48 RST_PAT = re.compile(r're?st', re.IGNORECASE)
48 49 PLAIN_PAT = re.compile(r'readme', re.IGNORECASE)
49 50
50 51 def _detect_renderer(self, source, filename=None):
51 52 """
52 53 runs detection of what renderer should be used for generating html
53 54 from a markup language
54 55
55 56 filename can also be given explicitly as a renderer name
56 57
57 58 :param source:
58 59 :param filename:
59 60 """
60 61
61 62 if MarkupRenderer.MARKDOWN_PAT.findall(filename):
62 63 detected_renderer = 'markdown'
63 64 elif MarkupRenderer.RST_PAT.findall(filename):
64 65 detected_renderer = 'rst'
65 66 elif MarkupRenderer.PLAIN_PAT.findall(filename):
66 67 detected_renderer = 'rst'
67 68 else:
68 69 detected_renderer = 'plain'
69 70
70 71 return getattr(MarkupRenderer, detected_renderer)
71 72
72 73 @classmethod
73 74 def _flavored_markdown(cls, text):
74 75 """
75 76 GitHub-style flavored Markdown
76 77
77 78 :param text:
78 79 """
79 80 from hashlib import md5
80 81
81 82 # Extract pre blocks.
82 83 extractions = {}
83 84 def pre_extraction_callback(matchobj):
84 85 digest = md5(matchobj.group(0)).hexdigest()
85 86 extractions[digest] = matchobj.group(0)
86 87 return "{gfm-extraction-%s}" % digest
87 88 pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
88 89 text = re.sub(pattern, pre_extraction_callback, text)
89 90
90 91 # Prevent foo_bar_baz from ending up with an italic word in the middle.
91 92 def italic_callback(matchobj):
92 93 s = matchobj.group(0)
93 94 if list(s).count('_') >= 2:
94 95 return s.replace('_', '\_')
95 96 return s
96 97 text = re.sub(r'^(?! {4}|\t)\w+_\w+_\w[\w_]*', italic_callback, text)
97 98
98 99 # In very clear cases, let newlines become <br /> tags.
99 100 def newline_callback(matchobj):
100 101 if len(matchobj.group(1)) == 1:
101 102 return matchobj.group(0).rstrip() + ' \n'
102 103 else:
103 104 return matchobj.group(0)
104 105 pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE)
105 106 text = re.sub(pattern, newline_callback, text)
106 107
107 108 # Insert pre block extractions.
108 109 def pre_insert_callback(matchobj):
109 110 return '\n\n' + extractions[matchobj.group(1)]
110 111 text = re.sub(r'{gfm-extraction-([0-9a-f]{32})\}',
111 112 pre_insert_callback, text)
112 113
113 114 return text
114 115
115 116 def render(self, source, filename=None):
116 117 """
117 118 Renders the given source using the renderer detected for filename.
118 119 It detects the renderer based on file extension or mimetype.
119 120 As a last resort, it just produces simple HTML, replacing newlines with <br/>.
120 121
121 122 :param filename:
122 123 :param source:
123 124 """
124 125
125 126 renderer = self._detect_renderer(source, filename)
126 127 readme_data = renderer(source)
127 128 return readme_data
128 129
129 130 @classmethod
130 131 def plain(cls, source, universal_newline=True):
131 132 source = safe_unicode(source)
132 133 if universal_newline:
133 134 newline = '\n'
134 135 source = newline.join(source.splitlines())
135 136
136 137 def url_func(match_obj):
137 138 url_full = match_obj.groups()[0]
138 139 return '<a href="%(url)s">%(url)s</a>' % ({'url': url_full})
139 140 source = url_re.sub(url_func, source)
140 141 return '<br />' + source.replace("\n", '<br />')
141 142
142 143 @classmethod
143 144 def markdown(cls, source, safe=True, flavored=False):
144 145 """
145 Convert Markdown (possibly GitHub Flavored) to HTML, possibly
146 Convert Markdown (possibly GitHub Flavored) to XSS safe HTML, possibly
146 147 with "safe" fall-back to plaintext.
147 148
148 149 >>> MarkupRenderer.markdown('''<img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg">''')
149 u'<p><img id="a" style="margin-top:-1000px;color:red" src="http://example.com/test.jpg"></p>'
150 u'<p><img id="a" src="http://example.com/test.jpg" style="color: red;"></p>'
150 151 >>> MarkupRenderer.markdown('''<img class="c d" src="file://localhost/test.jpg">''')
151 u'<p><img class="c d" src="file://localhost/test.jpg"></p>'
152 u'<p><img class="c d"></p>'
152 153 >>> MarkupRenderer.markdown('''<a href="foo">foo</a>''')
153 154 u'<p><a href="foo">foo</a></p>'
154 155 >>> MarkupRenderer.markdown('''<script>alert(1)</script>''')
155 u'<script>alert(1)</script>'
156 u'&lt;script&gt;alert(1)&lt;/script&gt;'
156 157 >>> MarkupRenderer.markdown('''<div onclick="alert(2)">yo</div>''')
157 u'<div onclick="alert(2)">yo</div>'
158 u'<div>yo</div>'
158 159 >>> MarkupRenderer.markdown('''<a href="javascript:alert(3)">yo</a>''')
159 u'<p><a href="javascript:alert(3)">yo</a></p>'
160 u'<p><a>yo</a></p>'
160 161 """
161 162 source = safe_unicode(source)
162 163 try:
163 164 if flavored:
164 165 source = cls._flavored_markdown(source)
165 166 markdown_html = markdown_mod.markdown(source, ['codehilite', 'extra'])
166 return markdown_html
167 # Allow most HTML, while preventing XSS issues:
168 # no <script> tags, no onclick attributes, no javascript
169 # "protocol", and also limit styling to prevent defacing.
170 return bleach.clean(markdown_html,
171 tags=['a', 'abbr', 'b', 'blockquote', 'br', 'code', 'dd',
172 'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5',
173 'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 'pre', 'span',
174 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th',
175 'thead', 'tr', 'ul'],
176 attributes=['class', 'id', 'style', 'label', 'title', 'alt', 'href', 'src'],
177 styles=['color'],
178 protocols=['http', 'https', 'mailto'],
179 )
167 180 except Exception:
168 181 log.error(traceback.format_exc())
169 182 if safe:
170 183 log.debug('Falling back to render in plain mode')
171 184 return cls.plain(source)
172 185 else:
173 186 raise
174 187
175 188 @classmethod
176 189 def rst(cls, source, safe=True):
177 190 source = safe_unicode(source)
178 191 try:
179 192 from docutils.core import publish_parts
180 193 from docutils.parsers.rst import directives
181 194 docutils_settings = dict([(alias, None) for alias in
182 195 cls.RESTRUCTUREDTEXT_DISALLOWED_DIRECTIVES])
183 196
184 197 docutils_settings.update({'input_encoding': 'unicode',
185 198 'report_level': 4})
186 199
187 200 for k, v in docutils_settings.iteritems():
188 201 directives.register_directive(k, v)
189 202
190 203 parts = publish_parts(source=source,
191 204 writer_name="html4css1",
192 205 settings_overrides=docutils_settings)
193 206
194 207 return parts['html_title'] + parts["fragment"]
195 208 except ImportError:
196 209 log.warning('Install docutils to use this function')
197 210 return cls.plain(source)
198 211 except Exception:
199 212 log.error(traceback.format_exc())
200 213 if safe:
201 214 log.debug('Falling back to render in plain mode')
202 215 return cls.plain(source)
203 216 else:
204 217 raise
205 218
206 219 @classmethod
207 220 def rst_with_mentions(cls, source):
208 221 mention_pat = re.compile(MENTIONS_REGEX)
209 222
210 223 def wrapp(match_obj):
211 224 uname = match_obj.groups()[0]
212 225 return '\ **@%(uname)s**\ ' % {'uname': uname}
213 226 mention_hl = mention_pat.sub(wrapp, source).strip()
214 227 return cls.rst(mention_hl)
@@ -1,192 +1,193 b''
1 1 #!/usr/bin/env python2
2 2 # -*- coding: utf-8 -*-
3 3 import os
4 4 import sys
5 5 import platform
6 6
7 7 if sys.version_info < (2, 6):
8 8 raise Exception('Kallithea requires python 2.6 or 2.7')
9 9
10 10
11 11 here = os.path.abspath(os.path.dirname(__file__))
12 12
13 13
14 14 def _get_meta_var(name, data, callback_handler=None):
15 15 import re
16 16 matches = re.compile(r'(?:%s)\s*=\s*(.*)' % name).search(data)
17 17 if matches:
18 18 if not callable(callback_handler):
19 19 callback_handler = lambda v: v
20 20
21 21 return callback_handler(eval(matches.groups()[0]))
22 22
23 23 _meta = open(os.path.join(here, 'kallithea', '__init__.py'), 'rb')
24 24 _metadata = _meta.read()
25 25 _meta.close()
26 26
27 27 callback = lambda V: ('.'.join(map(str, V[:3])) + '.'.join(V[3:]))
28 28 __version__ = _get_meta_var('VERSION', _metadata, callback)
29 29 __license__ = _get_meta_var('__license__', _metadata)
30 30 __author__ = _get_meta_var('__author__', _metadata)
31 31 __url__ = _get_meta_var('__url__', _metadata)
32 32 # defines current platform
33 33 __platform__ = platform.system()
34 34
35 35 is_windows = __platform__ in ['Windows']
36 36
37 37 requirements = [
38 38 "setuptools<34", # setuptools==34 has an undeclared requirement of pyparsing >=2.1, but celery<2.3 requires pyparsing<2
39 39 "waitress==0.8.8",
40 40 "webob>=1.0.8,<=1.1.1",
41 41 "webtest==1.4.3",
42 42 "Pylons>=1.0.0,<=1.0.3",
43 43 "Beaker==1.6.4",
44 44 "WebHelpers==1.3",
45 45 "formencode>=1.2.4,<=1.2.6",
46 46 "SQLAlchemy==0.7.10",
47 47 "Mako>=0.9.0,<=1.0.0",
48 48 "pygments>=1.5",
49 49 "whoosh>=2.4.0,<=2.5.7",
50 50 "celery>=2.2.5,<2.3",
51 51 "babel>=0.9.6,<=1.3",
52 52 "python-dateutil>=1.5.0,<2.0.0",
53 53 "markdown==2.2.1",
54 54 "docutils>=0.8.1,<=0.11",
55 55 "mock",
56 56 "URLObject==2.3.4",
57 57 "Routes==1.13",
58 58 "dulwich>=0.9.9,<=0.9.9",
59 59 "mercurial>=2.9,<4.3",
60 "bleach >= 3.0, < 3.1",
60 61 ]
61 62
62 63 if sys.version_info < (2, 7):
63 64 requirements.append("importlib==1.0.1")
64 65 requirements.append("unittest2")
65 66 requirements.append("argparse")
66 67
67 68 if not is_windows:
68 69 requirements.append("py-bcrypt>=0.3.0,<=0.4")
69 70
70 71
71 72 dependency_links = [
72 73 ]
73 74
74 75 classifiers = [
75 76 'Development Status :: 4 - Beta',
76 77 'Environment :: Web Environment',
77 78 'Framework :: Pylons',
78 79 'Intended Audience :: Developers',
79 80 'License :: OSI Approved :: GNU General Public License (GPL)',
80 81 'Operating System :: OS Independent',
81 82 'Programming Language :: Python',
82 83 'Programming Language :: Python :: 2.6',
83 84 'Programming Language :: Python :: 2.7',
84 85 'Topic :: Software Development :: Version Control',
85 86 ]
86 87
87 88
88 89 # additional files from the project that go somewhere in the filesystem,
89 90 # relative to sys.prefix
90 91 data_files = []
91 92
92 93 # additional files that go into the package itself
93 94 package_data = {'kallithea': ['i18n/*/LC_MESSAGES/*.mo', ], }
94 95
95 96 description = ('Kallithea is a fast and powerful management tool '
96 97 'for Mercurial and Git with a built in push/pull server, '
97 98 'full text search and code-review.')
98 99
99 100 keywords = ' '.join([
100 101 'kallithea', 'mercurial', 'git', 'code review',
101 102 'repo groups', 'ldap', 'repository management', 'hgweb replacement',
102 103 'hgwebdir', 'gitweb replacement', 'serving hgweb',
103 104 ])
104 105
105 106 # long description
106 107 README_FILE = 'README.rst'
107 108 CHANGELOG_FILE = 'docs/changelog.rst'
108 109 try:
109 110 long_description = open(README_FILE).read() + '\n\n' + \
110 111 open(CHANGELOG_FILE).read()
111 112
112 113 except IOError as err:
113 114 sys.stderr.write(
114 115 "[WARNING] Cannot find file specified as long_description (%s)\n or "
115 116 "changelog (%s) skipping that file" % (README_FILE, CHANGELOG_FILE)
116 117 )
117 118 long_description = description
118 119
119 120 try:
120 121 from setuptools import setup, find_packages
121 122 except ImportError:
122 123 from ez_setup import use_setuptools
123 124 use_setuptools()
124 125 from setuptools import setup, find_packages
125 126
126 127 # monkey patch setuptools to use distutils owner/group functionality
127 128 from setuptools.command import sdist
128 129 sdist_org = sdist.sdist
129 130 class sdist_new(sdist_org):
130 131 def initialize_options(self):
131 132 sdist_org.initialize_options(self)
132 133 self.owner = self.group = 'root'
133 134 sdist.sdist = sdist_new
134 135
135 136 # packages
136 137 packages = find_packages(exclude=['ez_setup'])
137 138
138 139 setup(
139 140 name='Kallithea',
140 141 version=__version__,
141 142 description=description,
142 143 long_description=long_description,
143 144 keywords=keywords,
144 145 license=__license__,
145 146 author=__author__,
146 147 author_email='kallithea@sfconservancy.org',
147 148 dependency_links=dependency_links,
148 149 url=__url__,
149 150 install_requires=requirements,
150 151 classifiers=classifiers,
151 152 setup_requires=["PasteScript>=1.6.3"],
152 153 data_files=data_files,
153 154 packages=packages,
154 155 include_package_data=True,
155 156 test_suite='nose.collector',
156 157 package_data=package_data,
157 158 message_extractors={'kallithea': [
158 159 ('**.py', 'python', None),
159 160 ('templates/**.mako', 'mako', {'input_encoding': 'utf-8'}),
160 161 ('templates/**.html', 'mako', {'input_encoding': 'utf-8'}),
161 162 ('public/**', 'ignore', None)]},
162 163 zip_safe=False,
163 164 paster_plugins=['PasteScript', 'Pylons'],
164 165 entry_points="""
165 166 [console_scripts]
166 167 kallithea-api = kallithea.bin.kallithea_api:main
167 168 kallithea-gist = kallithea.bin.kallithea_gist:main
168 169 kallithea-config = kallithea.bin.kallithea_config:main
169 170
170 171 [paste.app_factory]
171 172 main = kallithea.config.middleware:make_app
172 173
173 174 [paste.app_install]
174 175 main = pylons.util:PylonsInstaller
175 176
176 177 [paste.global_paster_command]
177 178 setup-db=kallithea.lib.paster_commands.setup_db:Command
178 179 cleanup-repos=kallithea.lib.paster_commands.cleanup:Command
179 180 update-repoinfo=kallithea.lib.paster_commands.update_repoinfo:Command
180 181 make-rcext=kallithea.lib.paster_commands.make_rcextensions:Command
181 182 repo-scan=kallithea.lib.paster_commands.repo_scan:Command
182 183 cache-keys=kallithea.lib.paster_commands.cache_keys:Command
183 184 ishell=kallithea.lib.paster_commands.ishell:Command
184 185 make-index=kallithea.lib.paster_commands.make_index:Command
185 186 upgrade-db=kallithea.lib.dbmigrate:UpgradeDb
186 187 celeryd=kallithea.lib.celerypylons.commands:CeleryDaemonCommand
187 188 install-iis=kallithea.lib.paster_commands.install_iis:Command
188 189
189 190 [nose.plugins]
190 191 pylons = pylons.test:PylonsPlugin
191 192 """,
192 193 )
General Comments 0
You need to be logged in to leave comments. Login now