diff --git a/pkgs/python-packages.nix b/pkgs/python-packages.nix --- a/pkgs/python-packages.nix +++ b/pkgs/python-packages.nix @@ -407,30 +407,75 @@ self: super: { }; }; "elasticsearch" = super.buildPythonPackage { - name = "elasticsearch-2.3.0"; + name = "elasticsearch-6.3.1"; doCheck = false; propagatedBuildInputs = [ self."urllib3" ]; src = fetchurl { - url = "https://files.pythonhosted.org/packages/10/35/5fd52c5f0b0ee405ed4b5195e8bce44c5e041787680dc7b94b8071cac600/elasticsearch-2.3.0.tar.gz"; - sha256 = "10ad2dk73xsys9vajwsncibs69asa63w1hgwz6lz1prjpyi80c5y"; + url = "https://files.pythonhosted.org/packages/9d/ce/c4664e8380e379a9402ecfbaf158e56396da90d520daba21cfa840e0eb71/elasticsearch-6.3.1.tar.gz"; + sha256 = "12y93v0yn7a4xmf969239g8gb3l4cdkclfpbk1qc8hx5qkymrnma"; }; meta = { license = [ pkgs.lib.licenses.asl20 ]; }; }; "elasticsearch-dsl" = super.buildPythonPackage { - name = "elasticsearch-dsl-2.2.0"; + name = "elasticsearch-dsl-6.3.1"; doCheck = false; propagatedBuildInputs = [ self."six" self."python-dateutil" self."elasticsearch" + self."ipaddress" + ]; + src = fetchurl { + url = "https://files.pythonhosted.org/packages/4c/0d/1549f50c591db6bb4e66cbcc8d34a6e537c3d89aa426b167c244fd46420a/elasticsearch-dsl-6.3.1.tar.gz"; + sha256 = "1gh8a0shqi105k325hgwb9avrpdjh0mc6mxwfg9ba7g6lssb702z"; + }; + meta = { + license = [ pkgs.lib.licenses.asl20 ]; + }; + }; + "elasticsearch1" = super.buildPythonPackage { + name = "elasticsearch1-1.10.0"; + doCheck = false; + propagatedBuildInputs = [ + self."urllib3" ]; src = fetchurl { - url = "https://files.pythonhosted.org/packages/66/2f/52a086968788e58461641570f45c3207a52d46ebbe9b77dc22b6a8ffda66/elasticsearch-dsl-2.2.0.tar.gz"; - sha256 = "1g4kxzxsdwlsl2a9kscmx11pafgimhj7y8wrfksv8pgvpkfb9fwr"; + url = "https://files.pythonhosted.org/packages/a6/eb/73e75f9681fa71e3157b8ee878534235d57f24ee64f0e77f8d995fb57076/elasticsearch1-1.10.0.tar.gz"; + sha256 = "0g89444kd5zwql4vbvyrmi2m6l6dcj6ga98j4hqxyyyz6z20aki2"; + }; + meta = { + 
license = [ pkgs.lib.licenses.asl20 ]; + }; + }; + "elasticsearch1-dsl" = super.buildPythonPackage { + name = "elasticsearch1-dsl-0.0.12"; + doCheck = false; + propagatedBuildInputs = [ + self."six" + self."python-dateutil" + self."elasticsearch1" + ]; + src = fetchurl { + url = "https://files.pythonhosted.org/packages/eb/9d/785342775cb10eddc9b8d7457d618a423b4f0b89d8b2b2d1bc27190d71db/elasticsearch1-dsl-0.0.12.tar.gz"; + sha256 = "0ig1ly39v93hba0z975wnhbmzwj28w6w1sqlr2g7cn5spp732bhk"; + }; + meta = { + license = [ pkgs.lib.licenses.asl20 ]; + }; + }; + "elasticsearch2" = super.buildPythonPackage { + name = "elasticsearch2-2.5.0"; + doCheck = false; + propagatedBuildInputs = [ + self."urllib3" + ]; + src = fetchurl { + url = "https://files.pythonhosted.org/packages/84/77/63cf63d4ba11d913b5278406f2a37b0712bec6fc85edfb6151a33eaeba25/elasticsearch2-2.5.0.tar.gz"; + sha256 = "0ky0q16lbvz022yv6q3pix7aamf026p1y994537ccjf0p0dxnbxr"; }; meta = { license = [ pkgs.lib.licenses.asl20 ]; @@ -818,11 +863,11 @@ self: super: { }; }; "markupsafe" = super.buildPythonPackage { - name = "markupsafe-1.0"; + name = "markupsafe-1.1.0"; doCheck = false; src = fetchurl { - url = "https://files.pythonhosted.org/packages/4d/de/32d741db316d8fdb7680822dd37001ef7a448255de9699ab4bfcbdf4172b/MarkupSafe-1.0.tar.gz"; - sha256 = "0rdn1s8x9ni7ss8rfiacj7x1085lx8mh2zdwqslnw8xc3l4nkgm6"; + url = "https://files.pythonhosted.org/packages/ac/7e/1b4c2e05809a4414ebce0892fe1e32c14ace86ca7d50c70f00979ca9b3a3/MarkupSafe-1.1.0.tar.gz"; + sha256 = "1lxirjypbdd3l9jl4vliilhfnhy7c7f2vlldqg1b0i74khn375sf"; }; meta = { license = [ pkgs.lib.licenses.bsdOriginal ]; @@ -1271,11 +1316,11 @@ self: super: { }; }; "pyparsing" = super.buildPythonPackage { - name = "pyparsing-1.5.7"; + name = "pyparsing-2.3.0"; doCheck = false; src = fetchurl { - url = "https://files.pythonhosted.org/packages/6f/2c/47457771c02a8ff0f302b695e094ec309e30452232bd79198ee94fda689f/pyparsing-1.5.7.tar.gz"; - sha256 = 
"17z7ws076z977sclj628fvwrp8y9j2rvdjcsq42v129n1gwi8vk4"; + url = "https://files.pythonhosted.org/packages/d0/09/3e6a5eeb6e04467b737d55f8bba15247ac0876f98fae659e58cd744430c6/pyparsing-2.3.0.tar.gz"; + sha256 = "14k5v7n3xqw8kzf42x06bzp184spnlkya2dpjyflax6l3yrallzk"; }; meta = { license = [ pkgs.lib.licenses.mit ]; @@ -1642,7 +1687,7 @@ self: super: { }; }; "rhodecode-enterprise-ce" = super.buildPythonPackage { - name = "rhodecode-enterprise-ce-4.15.0"; + name = "rhodecode-enterprise-ce-4.16.0"; buildInputs = [ self."pytest" self."py" @@ -1788,7 +1833,7 @@ self: super: { }; }; "rhodecode-tools" = super.buildPythonPackage { - name = "rhodecode-tools-1.0.1"; + name = "rhodecode-tools-1.1.0"; doCheck = false; propagatedBuildInputs = [ self."click" @@ -1797,14 +1842,16 @@ self: super: { self."mako" self."markupsafe" self."requests" - self."elasticsearch" - self."elasticsearch-dsl" self."urllib3" self."whoosh" + self."elasticsearch" + self."elasticsearch-dsl" + self."elasticsearch2" + self."elasticsearch1-dsl" ]; src = fetchurl { - url = "https://code.rhodecode.com/rhodecode-tools-ce/archive/v1.0.1.tar.gz?md5=ffb5d6bcb855305b93cfe23ad42e500b"; - sha256 = "0nr300s4sg685qs4wgbwlplwriawrwi6jq79z37frcnpyc89gpvm"; + url = "https://code.rhodecode.com/rhodecode-tools-ce/archive/v1.1.0.tar.gz?md5=cc320c277cb2add546220290ac9be626"; + sha256 = "1wbnnfrzyp0d4ys55vj5vnfrzfhwlqgdhc8yv8i6kwinizf8hfrn"; }; meta = { license = [ { fullName = "Apache 2.0 and Proprietary"; } ]; @@ -1848,11 +1895,11 @@ self: super: { }; }; "setuptools" = super.buildPythonPackage { - name = "setuptools-40.6.2"; + name = "setuptools-40.6.3"; doCheck = false; src = fetchurl { - url = "https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip"; - sha256 = "0r2c5hapirlzm34h7pl1lgkm6gk7bcrlrdj28qgsvaqg3f74vfw6"; + url = 
"https://files.pythonhosted.org/packages/37/1b/b25507861991beeade31473868463dad0e58b1978c209de27384ae541b0b/setuptools-40.6.3.zip"; + sha256 = "1y085dnk574sxw9aymdng9gijvrsbw86hsv9hqnhv7y4d6nlsirv"; }; meta = { license = [ pkgs.lib.licenses.mit ]; @@ -2043,11 +2090,11 @@ self: super: { }; }; "urllib3" = super.buildPythonPackage { - name = "urllib3-1.21"; + name = "urllib3-1.24.1"; doCheck = false; src = fetchurl { - url = "https://files.pythonhosted.org/packages/34/95/7b28259d0006ed681c424cd71a668363265eac92b67dddd018eb9a22bff8/urllib3-1.21.tar.gz"; - sha256 = "0irnj4wvh2y36s4q3l2vas9qr9m766w6w418nb490j3mf8a8zw6h"; + url = "https://files.pythonhosted.org/packages/b1/53/37d82ab391393565f2f831b8eedbffd57db5a718216f82f1a8b4d381a1c1/urllib3-1.24.1.tar.gz"; + sha256 = "08lwd9f3hqznyf32vnzwvp87pchx062nkbgyrf67rwlkgj0jk5fy"; }; meta = { license = [ pkgs.lib.licenses.mit ]; diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -36,7 +36,7 @@ kombu==4.2.0 lxml==4.2.5 mako==1.0.7 markdown==2.6.11 -markupsafe==1.0.0 +markupsafe==1.1.0 msgpack-python==0.5.6 pyotp==2.2.7 packaging==15.2 @@ -51,7 +51,7 @@ pycrypto==2.6.1 pycurl==7.43.0.2 pyflakes==0.8.1 pygments==2.3.0 -pyparsing==1.5.7 +pyparsing==2.3.0 pyramid-beaker==0.8 pyramid-debugtoolbar==4.4.0 pyramid-jinja2==2.7 @@ -79,7 +79,7 @@ subprocess32==3.5.2 supervisor==3.3.4 tempita==0.5.2 translationstring==1.3 -urllib3==1.21 +urllib3==1.24.1 urlobject==2.4.3 venusian==1.1.0 weberror==0.10.3 @@ -123,7 +123,7 @@ ipdb==0.11.0 ipython==5.1.0 ## rhodecode-tools, special case -https://code.rhodecode.com/rhodecode-tools-ce/archive/v1.0.1.tar.gz?md5=ffb5d6bcb855305b93cfe23ad42e500b#egg=rhodecode-tools==1.0.1 +https://code.rhodecode.com/rhodecode-tools-ce/archive/v1.1.0.tar.gz?md5=cc320c277cb2add546220290ac9be626#egg=rhodecode-tools==1.1.0 ## appenlight appenlight-client==0.6.26 diff --git a/rhodecode/apps/admin/views/settings.py b/rhodecode/apps/admin/views/settings.py --- 
a/rhodecode/apps/admin/views/settings.py +++ b/rhodecode/apps/admin/views/settings.py @@ -666,8 +666,8 @@ class AdminSettingsView(BaseAppView): c = self.load_default_context() c.active = 'search' - searcher = searcher_from_config(self.request.registry.settings) - c.statistics = searcher.statistics(self.request.translate) + c.searcher = searcher_from_config(self.request.registry.settings) + c.statistics = c.searcher.statistics(self.request.translate) return self._get_template_context(c) diff --git a/rhodecode/apps/home/views.py b/rhodecode/apps/home/views.py --- a/rhodecode/apps/home/views.py +++ b/rhodecode/apps/home/views.py @@ -246,9 +246,9 @@ class HomeView(BaseAppView): } for obj in acl_iter] - def _get_hash_commit_list(self, auth_user, query): + def _get_hash_commit_list(self, auth_user, searcher, query): org_query = query - if not query or len(query) < 3: + if not query or len(query) < 3 or not searcher: return [] commit_hashes = re.compile('(?:commit:)([0-9a-f]{2,40})').findall(query) @@ -257,9 +257,8 @@ class HomeView(BaseAppView): return [] commit_hash = commit_hashes[0] - searcher = searcher_from_config(self.request.registry.settings) result = searcher.search( - 'commit_id:%s*' % commit_hash, 'commit', auth_user, + 'commit_id:{}*'.format(commit_hash), 'commit', auth_user, raise_on_exc=False) return [ @@ -303,6 +302,84 @@ class HomeView(BaseAppView): } return data + def _get_default_search_queries(self, search_context, searcher, query): + if not searcher: + return [] + is_es_6 = searcher.is_es_6 + + queries = [] + repo_group_name, repo_name, repo_context = None, None, None + + # repo group context + if search_context.get('search_context[repo_group_name]'): + repo_group_name = search_context.get('search_context[repo_group_name]') + if search_context.get('search_context[repo_name]'): + repo_name = search_context.get('search_context[repo_name]') + repo_context = search_context.get('search_context[repo_view_type]') + + if is_es_6 and repo_name: + def 
query_modifier(): + qry = '{} repo_name.raw:{} '.format( + query, searcher.escape_specials(repo_name)) + return {'q': qry, 'type': 'content'} + label = u'Search for `{}` through files in this repository.'.format(query) + queries.append( + { + 'id': -10, + 'value': query, + 'value_display': label, + 'type': 'search', + 'url': h.route_path( + 'search_repo', repo_name=repo_name, _query=query_modifier()) + } + ) + + def query_modifier(): + qry = '{} repo_name.raw:{} '.format( + query, searcher.escape_specials(repo_name)) + return {'q': qry, 'type': 'commit'} + label = u'Search for `{}` through commits in this repository.'.format(query) + queries.append( + { + 'id': -10, + 'value': query, + 'value_display': label, + 'type': 'search', + 'url': h.route_path( + 'search_repo', repo_name=repo_name, _query=query_modifier()) + } + ) + + elif is_es_6 and repo_group_name: + def query_modifier(): + qry = '{} repo_name.raw:{} '.format( + query, searcher.escape_specials(repo_group_name + '/*')) + return {'q': qry, 'type': 'content'} + label = u'Search for `{}` through files in this repository group'.format(query) + queries.append( + { + 'id': -20, + 'value': query, + 'value_display': label, + 'type': 'search', + 'url': h.route_path('search', _query=query_modifier()) + } + ) + + if not queries: + queries.append( + { + 'id': -1, + 'value': query, + 'value_display': u'Search for: `{}`'.format(query), + 'type': 'search', + 'url': h.route_path('search', + _query={'q': query, 'type': 'content'}) + } + ) + + return queries + @LoginRequired() @view_config( route_name='goto_switcher_data', request_method='GET', @@ -315,26 +392,21 @@ class HomeView(BaseAppView): query = self.request.GET.get('query') log.debug('generating main filter data, query %s', query) - default_search_val = u'Full text search for: `{}`'.format(query) res = [] if not query: return {'suggestions': res} - res.append({ - 'id': -1, - 'value': query, - 'value_display': default_search_val, - 'type': 'search', - 'url': 
h.route_path( - 'search', _query={'q': query}) - }) - repo_group_id = safe_int(self.request.GET.get('repo_group_id')) + searcher = searcher_from_config(self.request.registry.settings) + for _q in self._get_default_search_queries(self.request.GET, searcher, query): + res.append(_q) + + repo_group_id = safe_int(self.request.GET.get('search_context[repo_group_id]')) if repo_group_id: repo_group = RepoGroup.get(repo_group_id) composed_hint = '{}/{}'.format(repo_group.group_name, query) show_hint = not query.startswith(repo_group.group_name) if repo_group and show_hint: - hint = u'Group search: `{}`'.format(composed_hint) + hint = u'Repository search inside: `{}`'.format(composed_hint) res.append({ 'id': -1, 'value': composed_hint, @@ -351,7 +423,7 @@ class HomeView(BaseAppView): for serialized_repo in repos: res.append(serialized_repo) - # TODO(marcink): permissions for that ? + # TODO(marcink): should all logged in users be allowed to search others? allowed_user_search = self._rhodecode_user.username != User.DEFAULT_USER if allowed_user_search: users = self._get_user_list(query) @@ -362,7 +434,7 @@ class HomeView(BaseAppView): for serialized_user_group in user_groups: res.append(serialized_user_group) - commits = self._get_hash_commit_list(c.auth_user, query) + commits = self._get_hash_commit_list(c.auth_user, searcher, query) if commits: unique_repos = collections.OrderedDict() for commit in commits: diff --git a/rhodecode/apps/search/views.py b/rhodecode/apps/search/views.py --- a/rhodecode/apps/search/views.py +++ b/rhodecode/apps/search/views.py @@ -45,11 +45,14 @@ def search(request, tmpl_context, repo_n errors = [] try: search_params = schema.deserialize( - dict(search_query=request.GET.get('q'), - search_type=request.GET.get('type'), - search_sort=request.GET.get('sort'), - page_limit=request.GET.get('page_limit'), - requested_page=request.GET.get('page')) + dict( + search_query=request.GET.get('q'), + search_type=request.GET.get('type'), + 
search_sort=request.GET.get('sort'), + search_max_lines=request.GET.get('max_lines'), + page_limit=request.GET.get('page_limit'), + requested_page=request.GET.get('page'), + ) ) except validation_schema.Invalid as e: errors = e.children @@ -57,12 +60,13 @@ def search(request, tmpl_context, repo_n def url_generator(**kw): q = urllib.quote(safe_str(search_query)) return update_params( - "?q=%s&type=%s" % (q, safe_str(search_type)), **kw) + "?q=%s&type=%s&max_lines=%s" % (q, safe_str(search_type), search_max_lines), **kw) c = tmpl_context search_query = search_params.get('search_query') search_type = search_params.get('search_type') search_sort = search_params.get('search_sort') + search_max_lines = search_params.get('search_max_lines') if search_params.get('search_query'): page_limit = search_params['page_limit'] requested_page = search_params['requested_page'] diff --git a/rhodecode/lib/helpers.py b/rhodecode/lib/helpers.py --- a/rhodecode/lib/helpers.py +++ b/rhodecode/lib/helpers.py @@ -48,7 +48,6 @@ import bleach from datetime import datetime from functools import partial from pygments.formatters.html import HtmlFormatter -from pygments import highlight as code_highlight from pygments.lexers import ( get_lexer_by_name, get_lexer_for_filename, get_lexer_for_mimetype) @@ -81,12 +80,14 @@ from rhodecode.lib.utils2 import str2boo from rhodecode.lib.markup_renderer import MarkupRenderer, relative_links from rhodecode.lib.vcs.exceptions import CommitDoesNotExistError from rhodecode.lib.vcs.backends.base import BaseChangeset, EmptyCommit +from rhodecode.lib.index.search_utils import get_matching_line_offsets from rhodecode.config.conf import DATE_FORMAT, DATETIME_FORMAT from rhodecode.model.changeset_status import ChangesetStatusModel from rhodecode.model.db import Permission, User, Repository from rhodecode.model.repo_group import RepoGroupModel from rhodecode.model.settings import IssueTrackerSettingsModel + log = logging.getLogger(__name__) @@ -260,6 +261,21 @@ def 
files_breadcrumbs(repo_name, commit_ return literal('/'.join(url_segments)) +def code_highlight(code, lexer, formatter, use_hl_filter=False): + """ + Lex ``code`` with ``lexer`` and format it with the formatter ``formatter``. + + If ``outfile`` is given and a valid file object (an object + with a ``write`` method), the result will be written to it, otherwise + it is returned as a string. + """ + if use_hl_filter: + # add HL filter + from rhodecode.lib.index import search_utils + lexer.add_filter(search_utils.ElasticSearchHLFilter()) + return pygments.format(pygments.lex(code, lexer), formatter) + + class CodeHtmlFormatter(HtmlFormatter): """ My code Html Formatter for source codes @@ -386,110 +402,9 @@ class SearchContentCodeHtmlFormatter(Cod current_line_number += 1 - yield 0, '' -def extract_phrases(text_query): - """ - Extracts phrases from search term string making sure phrases - contained in double quotes are kept together - and discarding empty values - or fully whitespace values eg. 
- - 'some text "a phrase" more' => ['some', 'text', 'a phrase', 'more'] - - """ - - in_phrase = False - buf = '' - phrases = [] - for char in text_query: - if in_phrase: - if char == '"': # end phrase - phrases.append(buf) - buf = '' - in_phrase = False - continue - else: - buf += char - continue - else: - if char == '"': # start phrase - in_phrase = True - phrases.append(buf) - buf = '' - continue - elif char == ' ': - phrases.append(buf) - buf = '' - continue - else: - buf += char - - phrases.append(buf) - phrases = [phrase.strip() for phrase in phrases if phrase.strip()] - return phrases - - -def get_matching_offsets(text, phrases): - """ - Returns a list of string offsets in `text` that the list of `terms` match - - >>> get_matching_offsets('some text here', ['some', 'here']) - [(0, 4), (10, 14)] - - """ - offsets = [] - for phrase in phrases: - for match in re.finditer(phrase, text): - offsets.append((match.start(), match.end())) - - return offsets - - -def normalize_text_for_matching(x): - """ - Replaces all non alnum characters to spaces and lower cases the string, - useful for comparing two text strings without punctuation - """ - return re.sub(r'[^\w]', ' ', x.lower()) - - -def get_matching_line_offsets(lines, terms): - """ Return a set of `lines` indices (starting from 1) matching a - text search query, along with `context` lines above/below matching lines - - :param lines: list of strings representing lines - :param terms: search term string to match in lines eg. 'some text' - :param context: number of lines above/below a matching line to add to result - :param max_lines: cut off for lines of interest - eg. 
- - text = ''' - words words words - words words words - some text some - words words words - words words words - text here what - ''' - get_matching_line_offsets(text, 'text', context=1) - {3: [(5, 9)], 6: [(0, 4)]] - - """ - matching_lines = {} - phrases = [normalize_text_for_matching(phrase) - for phrase in extract_phrases(terms)] - - for line_index, line in enumerate(lines, start=1): - match_offsets = get_matching_offsets( - normalize_text_for_matching(line), phrases) - if match_offsets: - matching_lines[line_index] = match_offsets - - return matching_lines - - def hsv_to_rgb(h, s, v): """ Convert hsv color values to rgb """ @@ -1904,25 +1819,6 @@ def journal_filter_help(request): ).format(actions=actions) -def search_filter_help(searcher, request): - _ = request.translate - - terms = '' - return _( - 'Example filter terms for `{searcher}` search:\n' + - '{terms}\n' + - 'Generate wildcards using \'*\' character:\n' + - ' "repo_name:vcs*" - search everything starting with \'vcs\'\n' + - ' "repo_name:*vcs*" - search for repository containing \'vcs\'\n' + - '\n' + - 'Optional AND / OR operators in queries\n' + - ' "repo_name:vcs OR repo_name:test"\n' + - ' "owner:test AND repo_name:test*"\n' + - 'More: {search_doc}' - ).format(searcher=searcher.name, - terms=terms, search_doc=searcher.query_lang_doc) - - def not_mapped_error(repo_name): from rhodecode.translation import _ flash(_('%s repository is not mapped to db perhaps' @@ -2107,3 +2003,15 @@ def go_import_header(request, db_repo=No def reviewer_as_json(*args, **kwargs): from rhodecode.apps.repository.utils import reviewer_as_json as _reviewer_as_json return _reviewer_as_json(*args, **kwargs) + + +def get_repo_view_type(request): + route_name = request.matched_route.name + route_to_view_type = { + 'repo_changelog': 'changelog', + 'repo_files': 'files', + 'repo_summary': 'summary', + 'repo_commit': 'commit' + + } + return route_to_view_type.get(route_name) diff --git a/rhodecode/lib/index/__init__.py 
b/rhodecode/lib/index/__init__.py --- a/rhodecode/lib/index/__init__.py +++ b/rhodecode/lib/index/__init__.py @@ -25,15 +25,27 @@ Index schema for RhodeCode import importlib import logging +from rhodecode.lib.index.search_utils import normalize_text_for_matching + log = logging.getLogger(__name__) # leave defaults for backward compat default_searcher = 'rhodecode.lib.index.whoosh' default_location = '%(here)s/data/index' +ES_VERSION_2 = '2' +ES_VERSION_6 = '6' +# for legacy reasons we keep 2 compat as default +DEFAULT_ES_VERSION = ES_VERSION_2 -class BaseSearch(object): +from rhodecode_tools.lib.fts_index.elasticsearch_engine_6 import \ + ES_CONFIG # pragma: no cover + + +class BaseSearcher(object): query_lang_doc = '' + es_version = None + name = None def __init__(self): pass @@ -45,15 +57,42 @@ class BaseSearch(object): raise_on_exc=True): raise Exception('NotImplemented') + @staticmethod + def query_to_mark(query, default_field=None): + """ + Formats the query to mark token for jquery.mark.js highlighting. ES could + have a different format optionally. 
-def searcher_from_config(config, prefix='search.'): + :param default_field: + :param query: + """ + return ' '.join(normalize_text_for_matching(query).split()) + + @property + def is_es_6(self): + return self.es_version == ES_VERSION_6 + + def get_handlers(self): + return {} + + +def search_config(config, prefix='search.'): _config = {} for key in config.keys(): if key.startswith(prefix): _config[key[len(prefix):]] = config[key] + return _config + + +def searcher_from_config(config, prefix='search.'): + _config = search_config(config, prefix) if 'location' not in _config: _config['location'] = default_location + if 'es_version' not in _config: + # use old legacy ES version set to 2 + _config['es_version'] = '2' + imported = importlib.import_module(_config.get('module', default_searcher)) - searcher = imported.Search(config=_config) + searcher = imported.Searcher(config=_config) return searcher diff --git a/rhodecode/lib/index/search_utils.py b/rhodecode/lib/index/search_utils.py new file mode 100644 --- /dev/null +++ b/rhodecode/lib/index/search_utils.py @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- + +# Copyright (C) 2012-2018 RhodeCode GmbH +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License, version 3 +# (only), as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# This program is dual-licensed. 
If you wish to learn more about the +# RhodeCode Enterprise Edition, including its added features, Support services, +# and proprietary license terms, please see https://rhodecode.com/licenses/ +import re + +import pygments.filter +import pygments.filters +from pygments.token import Comment + +HL_BEG_MARKER = '__RCSearchHLMarkBEG__' +HL_END_MARKER = '__RCSearchHLMarkEND__' +HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER) + + +class ElasticSearchHLFilter(pygments.filters.Filter): + _names = [HL_BEG_MARKER, HL_END_MARKER] + + def __init__(self, **options): + pygments.filters.Filter.__init__(self, **options) + + def filter(self, lexer, stream): + def tokenize(_value): + for token in re.split('({}|{})'.format( + self._names[0], self._names[1]), _value): + if token: + yield token + + hl = False + for ttype, value in stream: + + if self._names[0] in value or self._names[1] in value: + for item in tokenize(value): + if item == self._names[0]: + # skip marker, but start HL + hl = True + continue + elif item == self._names[1]: + hl = False + continue + + if hl: + yield Comment.ElasticMatch, item + else: + yield ttype, item + else: + if hl: + yield Comment.ElasticMatch, value + else: + yield ttype, value + + +def extract_phrases(text_query): + """ + Extracts phrases from search term string making sure phrases + contained in double quotes are kept together - and discarding empty values + or fully whitespace values eg. 
+ + 'some text "a phrase" more' => ['some', 'text', 'a phrase', 'more'] + + """ + + in_phrase = False + buf = '' + phrases = [] + for char in text_query: + if in_phrase: + if char == '"': # end phrase + phrases.append(buf) + buf = '' + in_phrase = False + continue + else: + buf += char + continue + else: + if char == '"': # start phrase + in_phrase = True + phrases.append(buf) + buf = '' + continue + elif char == ' ': + phrases.append(buf) + buf = '' + continue + else: + buf += char + + phrases.append(buf) + phrases = [phrase.strip() for phrase in phrases if phrase.strip()] + return phrases + + +def get_matching_phrase_offsets(text, phrases): + """ + Returns a list of string offsets in `text` that the list of `terms` match + + >>> get_matching_phrase_offsets('some text here', ['some', 'here']) + [(0, 4), (10, 14)] + + """ + phrases = phrases or [] + offsets = [] + + for phrase in phrases: + for match in re.finditer(phrase, text): + offsets.append((match.start(), match.end())) + + return offsets + + +def get_matching_markers_offsets(text, markers=None): + """ + Returns a list of string offsets in `text` that the are between matching markers + + >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', ['\$1(.*?)\$2']) + [(0, 5), (16, 22)] + + """ + markers = markers or [HL_MARKER_RE] + offsets = [] + + if markers: + for mark in markers: + for match in re.finditer(mark, text): + offsets.append((match.start(), match.end())) + + return offsets + + +def normalize_text_for_matching(x): + """ + Replaces all non alfanum characters to spaces and lower cases the string, + useful for comparing two text strings without punctuation + """ + return re.sub(r'[^\w]', ' ', x.lower()) + + +def get_matching_line_offsets(lines, terms=None, markers=None): + """ Return a set of `lines` indices (starting from 1) matching a + text search query, along with `context` lines above/below matching lines + + :param lines: list of strings representing lines + :param terms: search term 
string to match in lines eg. 'some text' + :param markers: instead of terms, use highlight markers instead that + mark beginning and end for matched item. eg. ['START(.*?)END'] + + eg. + + text = ''' + words words words + words words words + some text some + words words words + words words words + text here what + ''' + get_matching_line_offsets(text, 'text', context=1) + 6, {3: [(5, 9)], 6: [(0, 4)]] + + """ + matching_lines = {} + line_index = 0 + + if terms: + phrases = [normalize_text_for_matching(phrase) + for phrase in extract_phrases(terms)] + + for line_index, line in enumerate(lines.splitlines(), start=1): + normalized_line = normalize_text_for_matching(line) + match_offsets = get_matching_phrase_offsets(normalized_line, phrases) + if match_offsets: + matching_lines[line_index] = match_offsets + + else: + markers = markers or [HL_MARKER_RE] + for line_index, line in enumerate(lines.splitlines(), start=1): + match_offsets = get_matching_markers_offsets(line, markers=markers) + if match_offsets: + matching_lines[line_index] = match_offsets + + return line_index, matching_lines + + +def lucene_query_parser(): + # from pyparsing lucene_grammar + from pyparsing import ( + Literal, CaselessKeyword, Forward, Regex, QuotedString, Suppress, + Optional, Group, infixNotation, opAssoc, ParserElement, pyparsing_common) + + ParserElement.enablePackrat() + + COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = map(Literal, ":[]{}~^") + LPAR, RPAR = map(Suppress, "()") + and_, or_, not_, to_ = map(CaselessKeyword, "AND OR NOT TO".split()) + keyword = and_ | or_ | not_ | to_ + + expression = Forward() + + valid_word = Regex(r'([a-zA-Z0-9*_+.-]|\\[!(){}\[\]^"~*?\\:])+').setName("word") + valid_word.setParseAction( + lambda t: t[0] + .replace('\\\\', chr(127)) + .replace('\\', '') + .replace(chr(127), '\\') + ) + + string = QuotedString('"') + + required_modifier = Literal("+")("required") + prohibit_modifier = Literal("-")("prohibit") + integer = 
Regex(r"\d+").setParseAction(lambda t: int(t[0])) + proximity_modifier = Group(TILDE + integer("proximity")) + number = pyparsing_common.fnumber() + fuzzy_modifier = TILDE + Optional(number, default=0.5)("fuzzy") + + term = Forward() + field_name = valid_word().setName("fieldname") + incl_range_search = Group(LBRACK + term("lower") + to_ + term("upper") + RBRACK) + excl_range_search = Group(LBRACE + term("lower") + to_ + term("upper") + RBRACE) + range_search = incl_range_search("incl_range") | excl_range_search("excl_range") + boost = (CARAT + number("boost")) + + string_expr = Group(string + proximity_modifier) | string + word_expr = Group(valid_word + fuzzy_modifier) | valid_word + term << (Optional(field_name("field") + COLON) + + (word_expr | string_expr | range_search | Group( + LPAR + expression + RPAR)) + + Optional(boost)) + term.setParseAction(lambda t: [t] if 'field' in t or 'boost' in t else None) + + expression << infixNotation( + term, + [ + (required_modifier | prohibit_modifier, 1, opAssoc.RIGHT), + ((not_ | '!').setParseAction(lambda: "NOT"), 1, opAssoc.RIGHT), + ((and_ | '&&').setParseAction(lambda: "AND"), 2, opAssoc.LEFT), + (Optional(or_ | '||').setParseAction(lambda: "OR"), 2, opAssoc.LEFT), + ] + ) + + return expression diff --git a/rhodecode/lib/index/whoosh.py b/rhodecode/lib/index/whoosh.py --- a/rhodecode/lib/index/whoosh.py +++ b/rhodecode/lib/index/whoosh.py @@ -33,7 +33,7 @@ from whoosh.index import create_in, open from whoosh.qparser import QueryParser, QueryParserError import rhodecode.lib.helpers as h -from rhodecode.lib.index import BaseSearch +from rhodecode.lib.index import BaseSearcher from rhodecode.lib.utils2 import safe_unicode log = logging.getLogger(__name__) @@ -59,13 +59,13 @@ FRAGMENTER = ContextFragmenter(200) log = logging.getLogger(__name__) -class Search(BaseSearch): +class WhooshSearcher(BaseSearcher): # this also shows in UI query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html' name = 'whoosh' 
def __init__(self, config): - super(Search, self).__init__() + super(Searcher, self).__init__() self.config = config if not os.path.isdir(self.config['location']): os.makedirs(self.config['location']) @@ -162,16 +162,17 @@ class Search(BaseSearch): _ = translator stats = [ {'key': _('Index Type'), 'value': 'Whoosh'}, + {'sep': True}, + {'key': _('File Index'), 'value': str(self.file_index)}, - {'key': _('Indexed documents'), - 'value': self.file_index.doc_count()}, - {'key': _('Last update'), - 'value': h.time_to_datetime(self.file_index.last_modified())}, + {'key': _('Indexed documents'), 'value': self.file_index.doc_count()}, + {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())}, + + {'sep': True}, + {'key': _('Commit index'), 'value': str(self.commit_index)}, - {'key': _('Indexed documents'), - 'value': str(self.commit_index.doc_count())}, - {'key': _('Last update'), - 'value': h.time_to_datetime(self.commit_index.last_modified())} + {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())}, + {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())} ] return stats @@ -227,6 +228,9 @@ class Search(BaseSearch): return self.searcher +Searcher = WhooshSearcher + + class WhooshResultWrapper(object): def __init__(self, search_type, total_hits, results): self.search_type = search_type @@ -263,6 +267,8 @@ class WhooshResultWrapper(object): # TODO: marcink: this feels like an overkill, there's a lot of data # inside hit object, and we don't need all res = dict(hit) + # elastic search uses that, we set it empty so it fallbacks to regular HL logic + res['content_highlight'] = '' f_path = '' # pragma: no cover if self.search_type in ['content', 'path']: diff --git a/rhodecode/lib/utils2.py b/rhodecode/lib/utils2.py --- a/rhodecode/lib/utils2.py +++ b/rhodecode/lib/utils2.py @@ -1009,3 +1009,14 @@ def glob2re(pat): else: res = res + re.escape(c) return res + '\Z(?ms)' + + +def 
parse_byte_string(size_str): + match = re.match(r'(\d+)(MB|KB)', size_str, re.IGNORECASE) + if not match: + raise ValueError('Given size:%s is invalid, please make sure ' + 'to use format of (MB|KB)' % size_str) + + _parts = match.groups() + num, type_ = _parts + return long(num) * {'mb': 1024*1024, 'kb': 1024}[type_.lower()] diff --git a/rhodecode/lib/vcs/utils/__init__.py b/rhodecode/lib/vcs/utils/__init__.py --- a/rhodecode/lib/vcs/utils/__init__.py +++ b/rhodecode/lib/vcs/utils/__init__.py @@ -58,7 +58,7 @@ def author_name(author): to get the username """ - if not author or not '@' in author: + if not author or '@' not in author: return author else: return author.replace(author_email(author), '').replace('<', '')\ diff --git a/rhodecode/model/validation_schema/schemas/search_schema.py b/rhodecode/model/validation_schema/schemas/search_schema.py --- a/rhodecode/model/validation_schema/schemas/search_schema.py +++ b/rhodecode/model/validation_schema/schemas/search_schema.py @@ -34,6 +34,9 @@ class SearchParamsSchema(colander.Mappin colander.String(), missing='newfirst', validator=colander.OneOf(['oldfirst', 'newfirst'])) + search_max_lines = colander.SchemaNode( + colander.Integer(), + missing=10) page_limit = colander.SchemaNode( colander.Integer(), missing=10, diff --git a/rhodecode/public/css/code-block.less b/rhodecode/public/css/code-block.less --- a/rhodecode/public/css/code-block.less +++ b/rhodecode/public/css/code-block.less @@ -572,6 +572,7 @@ div.annotatediv { margin-left: 2px; marg .code-highlight, /* TODO: dan: merge codehilite into code-highlight */ /* This can be generated with `pygmentize -S default -f html` */ .codehilite { + .c-ElasticMatch { background-color: #faffa6; padding: 0.2em;} .hll { background-color: #ffffcc } .c { color: #408080; font-style: italic } /* Comment */ .err, .codehilite .err { border: none } /* Error */ @@ -640,6 +641,7 @@ div.annotatediv { margin-left: 2px; marg .vi { color: #19177C } /* Name.Variable.Instance */ .vm { 
color: #19177C } /* Name.Variable.Magic */ .il { color: #666666 } /* Literal.Number.Integer.Long */ + } /* customized pre blocks for markdown/rst */ diff --git a/rhodecode/public/css/type.less b/rhodecode/public/css/type.less --- a/rhodecode/public/css/type.less +++ b/rhodecode/public/css/type.less @@ -166,7 +166,6 @@ small, mark, .mark { - background-color: @rclightblue; padding: .2em; } diff --git a/rhodecode/templates/admin/settings/settings_search.mako b/rhodecode/templates/admin/settings/settings_search.mako --- a/rhodecode/templates/admin/settings/settings_search.mako +++ b/rhodecode/templates/admin/settings/settings_search.mako @@ -5,8 +5,13 @@
% for stat in c.statistics: -
${stat['key']}
-
${stat['value']}
+ % if stat.get('sep'): +
+
--
+ % else: +
${stat['key']}
+
${stat['value']}
+ % endif % endfor
diff --git a/rhodecode/templates/base/root.mako b/rhodecode/templates/base/root.mako --- a/rhodecode/templates/base/root.mako +++ b/rhodecode/templates/base/root.mako @@ -7,9 +7,12 @@ go_import_header = '' if hasattr(c, 'rhodecode_db_repo'): c.template_context['repo_type'] = c.rhodecode_db_repo.repo_type c.template_context['repo_landing_commit'] = c.rhodecode_db_repo.landing_rev[1] + ## check repo context + c.template_context['repo_view_type'] = h.get_repo_view_type(request) if getattr(c, 'repo_group', None): c.template_context['repo_group_id'] = c.repo_group.group_id + c.template_context['repo_group_name'] = c.repo_group.group_name if getattr(c, 'rhodecode_user', None) and c.rhodecode_user.user_id: c.template_context['rhodecode_user']['username'] = c.rhodecode_user.username @@ -23,6 +26,12 @@ c.template_context['default_user'] = { 'username': h.DEFAULT_USER, 'user_id': 1 } +c.template_context['search_context'] = { + 'repo_group_id': c.template_context.get('repo_group_id'), + 'repo_group_name': c.template_context.get('repo_group_name'), + 'repo_name': c.template_context.get('repo_name'), + 'repo_view_type': c.template_context.get('repo_view_type'), +} %> diff --git a/rhodecode/templates/search/search.mako b/rhodecode/templates/search/search.mako --- a/rhodecode/templates/search/search.mako +++ b/rhodecode/templates/search/search.mako @@ -18,10 +18,7 @@ %else: ${_('Search inside all accessible repositories')} %endif - %if c.cur_query: - » - ${c.cur_query} - %endif + <%def name="menu_bar_nav()"> @@ -59,7 +56,8 @@
${h.text('q', c.cur_query, placeholder="Enter query...")} - ${h.select('type',c.search_type,[('content',_('File contents')), ('commit',_('Commit messages')), ('path',_('File names')),],id='id_search_type')} + ${h.select('type',c.search_type,[('content',_('Files')), ('path',_('File path')),('commit',_('Commits'))],id='id_search_type')} + ${h.hidden('max_lines', '10')}
@@ -72,8 +70,54 @@ % endfor
-

${_('Example Queries')}

- +

${_('Query Language examples')}

+
${c.runtime}
@@ -96,6 +140,7 @@
+ +% endif diff --git a/rhodecode/templates/search/search_content.mako b/rhodecode/templates/search/search_content.mako --- a/rhodecode/templates/search/search_content.mako +++ b/rhodecode/templates/search/search_content.mako @@ -1,33 +1,10 @@ -<%def name="highlight_text_file(terms, text, url, line_context=3, - max_lines=10, - mimetype=None, filepath=None)"> -<% -lines = text.split('\n') -lines_of_interest = set() -matching_lines = h.get_matching_line_offsets(lines, terms) -shown_matching_lines = 0 -for line_number in matching_lines: - if len(lines_of_interest) < max_lines: - lines_of_interest |= set(range( - max(line_number - line_context, 0), - min(line_number + line_context, len(lines) + 1))) - shown_matching_lines += 1 - -%> -${h.code_highlight( - text, - h.get_lexer_safe( - mimetype=mimetype, - filepath=filepath, - ), - h.SearchContentCodeHtmlFormatter( - linenos=True, - cssclass="code-highlight", - url=url, - query_terms=terms, - only_line_numbers=lines_of_interest -))|n} +<%def name="highlight_text_file(has_matched_content, file_content, lexer, html_formatter, matching_lines, shown_matching_lines, url, use_hl_filter)"> +% if has_matched_content: + ${h.code_highlight(file_content, lexer, html_formatter, use_hl_filter=use_hl_filter)|n} +% else: + ${_('No content matched')}
+% endif %if len(matching_lines) > shown_matching_lines: @@ -37,12 +14,52 @@ for line_number in matching_lines:
+<% query_mark = c.searcher.query_to_mark(c.cur_query, 'content') %> + %for entry in c.formatted_results: + + <% + file_content = entry['content_highlight'] or entry['content'] + mimetype = entry.get('mimetype') + filepath = entry.get('path') + max_lines = h.safe_int(request.GET.get('max_lines', '10')) + line_context = h.safe_int(request.GET.get('line_context', '3')) + + match_file_url=h.route_path('repo_files',repo_name=entry['repository'], commit_id=entry.get('commit_id', 'tip'),f_path=entry['f_path'], _query={"mark": query_mark}) + terms = c.cur_query + + if c.searcher.is_es_6: + # use empty terms so we default to markers usage + total_lines, matching_lines = h.get_matching_line_offsets(file_content, terms=None) + else: + total_lines, matching_lines = h.get_matching_line_offsets(file_content, terms) + + shown_matching_lines = 0 + lines_of_interest = set() + for line_number in matching_lines: + if len(lines_of_interest) < max_lines: + lines_of_interest |= set(range( + max(line_number - line_context, 0), + min(line_number + line_context, total_lines + 1))) + shown_matching_lines += 1 + lexer = h.get_lexer_safe(mimetype=mimetype, filepath=filepath) + + html_formatter = h.SearchContentCodeHtmlFormatter( + linenos=True, + cssclass="code-highlight", + url=match_file_url, + query_terms=terms, + only_line_numbers=lines_of_interest + ) + + has_matched_content = len(lines_of_interest) >= 1 + + %> ## search results are additionally filtered, and this check is just a safe gate % if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(entry['repository'], 'search results content check'):
- ${highlight_text_file(c.cur_query, entry['content'], - url=h.route_path('repo_files',repo_name=entry['repository'],commit_id=entry.get('commit_id', 'tip'),f_path=entry['f_path']), - mimetype=entry.get('mimetype'), filepath=entry.get('path'))} + + ${highlight_text_file( + has_matched_content=has_matched_content, + file_content=file_content, + lexer=lexer, + html_formatter=html_formatter, + matching_lines=matching_lines, + shown_matching_lines=shown_matching_lines, + url=match_file_url, + use_hl_filter=c.searcher.is_es_6 + )}
+
% endif %endfor @@ -91,10 +138,14 @@ for line_number in matching_lines: %if c.cur_query: -%endif \ No newline at end of file +%endif diff --git a/rhodecode/templates/search/search_path.mako b/rhodecode/templates/search/search_path.mako --- a/rhodecode/templates/search/search_path.mako +++ b/rhodecode/templates/search/search_path.mako @@ -1,3 +1,5 @@ +% if c.formatted_results: + @@ -27,8 +29,10 @@ %endfor
${_('Repository')}
-%if c.cur_query and c.formatted_results: +%if c.cur_query:
${c.formatted_results.pager('$link_previous ~2~ $link_next')}
-%endif \ No newline at end of file +%endif + +% endif diff --git a/rhodecode/tests/lib/test_helpers.py b/rhodecode/tests/lib/test_helpers.py --- a/rhodecode/tests/lib/test_helpers.py +++ b/rhodecode/tests/lib/test_helpers.py @@ -208,44 +208,3 @@ def test_get_visual_attr(baseapp): def test_chop_at(test_text, inclusive, expected_text): assert helpers.chop_at_smart( test_text, '\n', inclusive, '...') == expected_text - - -@pytest.mark.parametrize('test_text, expected_output', [ - ('some text', ['some', 'text']), - ('some text', ['some', 'text']), - ('some text "with a phrase"', ['some', 'text', 'with a phrase']), - ('"a phrase" "another phrase"', ['a phrase', 'another phrase']), - ('"justphrase"', ['justphrase']), - ('""', []), - ('', []), - (' ', []), - ('" "', []), -]) -def test_extract_phrases(test_text, expected_output): - assert helpers.extract_phrases(test_text) == expected_output - - -@pytest.mark.parametrize('test_text, text_phrases, expected_output', [ - ('some text here', ['some', 'here'], [(0, 4), (10, 14)]), - ('here here there', ['here'], [(0, 4), (5, 9), (11, 15)]), - ('irrelevant', ['not found'], []), - ('irrelevant', ['not found'], []), -]) -def test_get_matching_offsets(test_text, text_phrases, expected_output): - assert helpers.get_matching_offsets( - test_text, text_phrases) == expected_output - - -def test_normalize_text_for_matching(): - assert helpers.normalize_text_for_matching( - 'OJjfe)*#$*@)$JF*)3r2f80h') == 'ojjfe jf 3r2f80h' - - -def test_get_matching_line_offsets(): - assert helpers.get_matching_line_offsets([ - 'words words words', - 'words words words', - 'some text some', - 'words words words', - 'words words words', - 'text here what'], 'text') == {3: [(5, 9)], 6: [(0, 4)]} diff --git a/rhodecode/tests/lib/test_search_utils.py b/rhodecode/tests/lib/test_search_utils.py new file mode 100644 --- /dev/null +++ b/rhodecode/tests/lib/test_search_utils.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright (C) 2010-2018 RhodeCode GmbH 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License, version 3 +# (only), as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# This program is dual-licensed. If you wish to learn more about the +# RhodeCode Enterprise Edition, including its added features, Support services, +# and proprietary license terms, please see https://rhodecode.com/licenses/ + +import copy +import mock +import pytest + +from rhodecode.lib.index import search_utils + + +@pytest.mark.parametrize('test_text, expected_output', [ + ('some text', ['some', 'text']), + ('some text', ['some', 'text']), + ('some text "with a phrase"', ['some', 'text', 'with a phrase']), + ('"a phrase" "another phrase"', ['a phrase', 'another phrase']), + ('"justphrase"', ['justphrase']), + ('""', []), + ('', []), + (' ', []), + ('" "', []), +]) +def test_extract_phrases(test_text, expected_output): + assert search_utils.extract_phrases(test_text) == expected_output + + +@pytest.mark.parametrize('test_text, text_phrases, expected_output', [ + ('some text here', ['some', 'here'], [(0, 4), (10, 14)]), + ('here here there', ['here'], [(0, 4), (5, 9), (11, 15)]), + ('irrelevant', ['not found'], []), + ('irrelevant', ['not found'], []), +]) +def test_get_matching_phrase_offsets(test_text, text_phrases, expected_output): + assert search_utils.get_matching_phrase_offsets( + test_text, text_phrases) == expected_output + + +@pytest.mark.parametrize('test_text, text_phrases, expected_output', [ + ('__RCSearchHLMarkBEG__some__RCSearchHLMarkEND__ text 
__RCSearchHLMarkBEG__here__RCSearchHLMarkEND__', [], [(0, 46), (52, 98)]), + ('__RCSearchHLMarkBEG__here__RCSearchHLMarkEND__ __RCSearchHLMarkBEG__here__RCSearchHLMarkEND__ there', [], [(0, 46), (47, 93)]), + ('some text __RCSearchHLMarkBEG__here__RCSearchHLMarkEND__', [], [(10, 56)]), + ('__RCSearchHLMarkBEG__here__RCSearchHLMarkEND__ __RCSearchHLMarkBEG__here__RCSearchHLMarkEND__ __RCSearchHLMarkBEG__there__RCSearchHLMarkEND__', [], [(0, 46), (47, 93), (94, 141)]), + ('irrelevant', ['not found'], []), + ('irrelevant', ['not found'], []), +]) +def test_get_matching_marker_offsets(test_text, text_phrases, expected_output): + + assert search_utils.get_matching_markers_offsets(test_text) == expected_output + + +def test_normalize_text_for_matching(): + assert search_utils.normalize_text_for_matching( + 'OJjfe)*#$*@)$JF*)3r2f80h') == 'ojjfe jf 3r2f80h' + + +def test_get_matching_line_offsets(): + words = '\n'.join([ + 'words words words', + 'words words words', + 'some text some', + 'words words words', + 'words words words', + 'text here what' + ]) + total_lines, matched_offsets = \ + search_utils.get_matching_line_offsets(words, terms='text') + assert total_lines == 6 + assert matched_offsets == {3: [(5, 9)], 6: [(0, 4)]} + + +def test_get_matching_line_offsets_using_markers(): + words = '\n'.join([ + 'words words words', + 'words words words', + 'some __1__text__2__ some', + 'words words words', + 'words words words', + '__1__text__2__ here what' + ]) + total_lines, matched_offsets = \ + search_utils.get_matching_line_offsets(words, terms=None, + markers=['__1__(.*?)__2__']) + assert total_lines == 6 + assert matched_offsets == {3: [(5, 19)], 6: [(0, 14)]}