# HG changeset patch # User RhodeCode Admin # Date 2023-07-19 08:26:46 # Node ID 34f9ec3879146b3b2023dfc78dc4ad5c6321b577 # Parent ff28a588dca2aa784ed9b53698630e08a1df8167 html_sanitizer: abstracted bleach into own function/code for later replacement - fixed bleach related test failures diff --git a/rhodecode/lib/helpers.py b/rhodecode/lib/helpers.py --- a/rhodecode/lib/helpers.py +++ b/rhodecode/lib/helpers.py @@ -44,7 +44,6 @@ from collections import OrderedDict import pygments import itertools import fnmatch -import bleach from datetime import datetime from functools import partial @@ -78,6 +77,7 @@ from webhelpers2.number import format_by from rhodecode.lib._vendor.webhelpers_backports import raw_select from rhodecode.lib.action_parser import action_parser +from rhodecode.lib.html_filters import sanitize_html from rhodecode.lib.pagination import Page, RepoPage, SqlPage from rhodecode.lib import ext_json from rhodecode.lib.ext_json import json @@ -1645,7 +1645,7 @@ def _process_url_func(match_obj, repo_na 'id-repr': issue_id, 'issue-prefix': entry['pref'], 'serv': entry['url'], - 'title': bleach.clean(desc, strip=True), + 'title': sanitize_html(desc, strip=True), 'hovercard_url': hovercard_url } diff --git a/rhodecode/lib/html_filters.py b/rhodecode/lib/html_filters.py --- a/rhodecode/lib/html_filters.py +++ b/rhodecode/lib/html_filters.py @@ -16,8 +16,47 @@ # RhodeCode Enterprise Edition, including its added features, Support services, # and proprietary license terms, please see https://rhodecode.com/licenses/ +import functools +import logging +from .html_sanitizer_defs import markdown_attrs, markdown_tags, all_tags, all_styles + + +log = logging.getLogger(__name__) + + # base64 filter e.g ${ example | base64,n } def base64(text): from rhodecode.lib.str_utils import base64_to_str return base64_to_str(text) + +def sanitize_html(text, **kwargs): + # TODO: replace this with https://nh3.readthedocs.io/en/latest + # bleach is abandoned and deprecated :/ + import 
bleach + from bleach.css_sanitizer import CSSSanitizer + + css_sanitizer = CSSSanitizer(allowed_css_properties=all_styles) + + markdown = kwargs.pop('markdown', False) + + allowed_attrs = markdown_attrs + + cleaner = functools.partial(bleach.clean, + tags=all_tags, + attributes=allowed_attrs, + css_sanitizer=css_sanitizer, + strip_comments=False, **kwargs) + + if markdown: + cleaner = functools.partial(bleach.clean, + tags=markdown_tags, + attributes=markdown_attrs, + css_sanitizer=css_sanitizer, + strip_comments=False, **kwargs) + + try: + return cleaner(text) + except Exception: + log.exception('Failed to sanitize html') + return 'TEXT CANNOT BE PARSED USING HTML SANITIZE' diff --git a/rhodecode/lib/bleach_whitelist.py b/rhodecode/lib/html_sanitizer_defs.py rename from rhodecode/lib/bleach_whitelist.py rename to rhodecode/lib/html_sanitizer_defs.py --- a/rhodecode/lib/bleach_whitelist.py +++ b/rhodecode/lib/html_sanitizer_defs.py @@ -62,7 +62,8 @@ markdown_tags = [ "a", "input", "details", - "summary" + "summary", + "div" ] markdown_attrs = { diff --git a/rhodecode/lib/markup_renderer.py b/rhodecode/lib/markup_renderer.py --- a/rhodecode/lib/markup_renderer.py +++ b/rhodecode/lib/markup_renderer.py @@ -28,7 +28,6 @@ import os import lxml import logging import urllib.parse -import bleach import pycmarkgfm from mako.lookup import TemplateLookup @@ -40,7 +39,7 @@ from docutils import writers from docutils.writers import html4css1 import markdown -from rhodecode.lib.utils2 import safe_str, md5_safe, MENTIONS_REGEX +from rhodecode.lib.utils2 import safe_str, MENTIONS_REGEX log = logging.getLogger(__name__) @@ -271,17 +270,8 @@ class MarkupRenderer(object): @classmethod def sanitize_html(cls, text): - # TODO: replace this with https://nh3.readthedocs.io/en/latest - # bleach is abandoned and deprecated :/ - - from .bleach_whitelist import markdown_attrs, markdown_tags - allowed_tags = markdown_tags - allowed_attrs = markdown_attrs - - try: - return bleach.clean(text, 
tags=allowed_tags, attributes=allowed_attrs) - except Exception: - return 'TEXT CANNOT BE PARSED USING SANITIZE' + from .html_filters import sanitize_html + return sanitize_html(text, markdown=True) @classmethod def renderer_from_filename(cls, filename, exclude): diff --git a/rhodecode/model/settings.py b/rhodecode/model/settings.py --- a/rhodecode/model/settings.py +++ b/rhodecode/model/settings.py @@ -21,13 +21,13 @@ import re import logging import time import functools -import bleach from collections import namedtuple from pyramid.threadlocal import get_current_request from rhodecode.lib import rc_cache from rhodecode.lib.hash_utils import sha1_safe +from rhodecode.lib.html_filters import sanitize_html from rhodecode.lib.utils2 import ( Optional, AttributeDict, safe_str, remove_prefix, str2bool) from rhodecode.lib.vcs.backends import base @@ -376,7 +376,7 @@ class IssueTrackerSettingsModel(object): def url_cleaner(input_str): input_str = input_str.replace('"', '').replace("'", '') - input_str = bleach.clean(input_str, strip=True) + input_str = sanitize_html(input_str, strip=True) return input_str # populate @@ -394,7 +394,7 @@ class IssueTrackerSettingsModel(object): 'pat_compiled': pat_compiled, 'url': url_cleaner( qs.get(self._get_keyname('url', uid, 'rhodecode_')) or ''), - 'pref': bleach.clean( + 'pref': sanitize_html( qs.get(self._get_keyname('pref', uid, 'rhodecode_')) or ''), 'desc': qs.get( self._get_keyname('desc', uid, 'rhodecode_')), diff --git a/rhodecode/tests/lib/test_html_sanitizer.py b/rhodecode/tests/lib/test_html_sanitizer.py new file mode 100644 --- /dev/null +++ b/rhodecode/tests/lib/test_html_sanitizer.py @@ -0,0 +1,38 @@ + +# Copyright (C) 2010-2023 RhodeCode GmbH +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License, version 3 +# (only), as published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# This program is dual-licensed. If you wish to learn more about the +# RhodeCode Enterprise Edition, including its added features, Support services, +# and proprietary license terms, please see https://rhodecode.com/licenses/ + +import pytest + +from rhodecode.lib.html_filters import sanitize_html + + +@pytest.mark.parametrize( + "src_html, expected_html", + [ + ('
ITEM
', '
ITEM
'), + ('
ITEM
', '
ITEM
'), + ('
ITEM
', '
ITEM
'), + ('
ITEM
', '
ITEM
'), + ('ITEM', 'ITEM'), + ('', ''), + ('', ''), + ]) +def test_html_sanitizer_options(src_html, expected_html): + parsed_html = sanitize_html(src_html) + assert parsed_html == expected_html