# Copyright (C) 2012-2023 RhodeCode GmbH
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License, version 3
# (only), as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This program is dual-licensed. If you wish to learn more about the
# RhodeCode Enterprise Edition, including its added features, Support services,
# and proprietary license terms, please see https://rhodecode.com/licenses/

"""
Whoosh full text search index and searcher for RhodeCode
"""

import os
import re
import logging

from whoosh import query as query_lib
from whoosh.highlight import HtmlFormatter, ContextFragmenter
from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
from whoosh.qparser import QueryParser, QueryParserError

import rhodecode.lib.helpers as h
from rhodecode.lib.index import BaseSearcher
from rhodecode.lib.str_utils import safe_str

log = logging.getLogger(__name__)

try:
    # we first try to import from rhodecode tools, fallback to copies if
    # we're unable to
    from rhodecode_tools.lib.fts_index.whoosh_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)
except ImportError:
    log.warning('rhodecode_tools schema not available, doing a fallback '
                'import from `rhodecode.lib.index.whoosh_fallback_schema`')
    from rhodecode.lib.index.whoosh_fallback_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)


FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)


class WhooshSearcher(BaseSearcher):
    # this also shows in UI
    query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
    name = 'whoosh'

    def __init__(self, config):
        super(WhooshSearcher, self).__init__()
        self.config = config
        if not os.path.isdir(self.config['location']):
            os.makedirs(self.config['location'])

        opener = create_in
        if exists_in(self.config['location'], indexname=FILE_INDEX_NAME):
            opener = open_dir
        file_index = opener(self.config['location'], schema=FILE_SCHEMA,
                            indexname=FILE_INDEX_NAME)

        opener = create_in
        if exists_in(self.config['location'], indexname=COMMIT_INDEX_NAME):
            opener = open_dir
        changeset_index = opener(self.config['location'], schema=COMMIT_SCHEMA,
                                 indexname=COMMIT_INDEX_NAME)

        self.commit_schema = COMMIT_SCHEMA
        self.commit_index = changeset_index
        self.file_schema = FILE_SCHEMA
        self.file_index = file_index
        self.searcher = None

    def cleanup(self):
        if self.searcher:
            self.searcher.close()

    def _extend_query(self, query):
        hashes = re.compile('([0-9a-f]{5,40})').findall(query)
        if hashes:
            hashes_or_query = ' OR '.join(
                'commit_id:%s*' % _hash for _hash in hashes)
            query = u'(%s) OR %s' % (query, hashes_or_query)
        return query

    def sort_def(self, search_type, direction, sort_field):

        if search_type == 'commit':
            field_defs = {
                'message': 'message',
                'date': 'date',
                'author_email': 'author',
            }
        elif search_type == 'path':
            field_defs = {
                'file': 'path',
                'size': 'size',
                'lines': 'lines',
            }
        elif search_type == 'content':
            # NOTE(dan): content doesn't support any sorting
            field_defs = {}
        else:
            return ''

        if sort_field in field_defs:
            return field_defs[sort_field]

    def search(self, query, document_type, search_user,
               repo_name=None, repo_group_name=None,
               requested_page=1, page_limit=10, sort=None, raise_on_exc=True):

        original_query = query
        query = self._extend_query(query)

        log.debug('QUERY: %s on %s', query, document_type)
        result = {
            'results': [],
            'count': 0,
            'error': None,
            'runtime': 0
        }
        search_type, index_name, schema_defn = self._prepare_for_search(
            document_type)
        self._init_searcher(index_name)
        try:
            qp = QueryParser(search_type, schema=schema_defn)
            allowed_repos_filter = self._get_repo_filter(
                search_user, repo_name)
            try:
                query = qp.parse(safe_str(query))
                log.debug('query: %s (%s)', query, repr(query))

                reverse, sorted_by = False, None
                direction, sort_field = self.get_sort(search_type, sort)
                if sort_field:
                    sort_definition = self.sort_def(search_type, direction, sort_field)
                    if sort_definition:
                        sorted_by = sort_definition
                        if direction == Searcher.DIRECTION_DESC:
                            reverse = True
                        if direction == Searcher.DIRECTION_ASC:
                            reverse = False

                whoosh_results = self.searcher.search(
                    query, filter=allowed_repos_filter, limit=None,
                    sortedby=sorted_by, reverse=reverse)

                # fixes for 32k limit that whoosh uses for highlight
                whoosh_results.fragmenter.charlimit = None
                res_ln = whoosh_results.scored_length()
                result['runtime'] = whoosh_results.runtime
                result['count'] = res_ln
                result['results'] = WhooshResultWrapper(
                    search_type, res_ln, whoosh_results)

            except QueryParserError:
                result['error'] = 'Invalid search query. Try quoting it.'
        except (EmptyIndexError, IOError, OSError):
            msg = 'There is no index to search in. Please run whoosh indexer'
            log.exception(msg)
            result['error'] = msg
        except Exception:
            msg = 'An error occurred during this search operation'
            log.exception(msg)
            result['error'] = msg

        return result

    def statistics(self, translator):
        _ = translator
        stats = [
            {'key': _('Index Type'), 'value': 'Whoosh'},
            {'sep': True},

            {'key': _('File Index'), 'value': str(self.file_index)},
            {'key': _('Indexed documents'), 'value': self.file_index.doc_count()},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())},

            {'sep': True},

            {'key': _('Commit index'), 'value': str(self.commit_index)},
            {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())}
        ]
        return stats

    def _get_repo_filter(self, auth_user, repo_name):

        allowed_to_search = [
            repo for repo, perm in
            auth_user.permissions['repositories'].items()
            if perm != 'repository.none']

        if repo_name:
            repo_filter = [query_lib.Term('repository', repo_name)]

        elif 'hg.admin' in auth_user.permissions.get('global', []):
            return None

        else:
            repo_filter = [query_lib.Term('repository', _rn)
                           for _rn in allowed_to_search]
            # in case we're not allowed to search anywhere, this is a trick
            # to tell whoosh we're still filtering, on ALL results
            repo_filter = repo_filter or [query_lib.Term('repository', '')]

        return query_lib.Or(repo_filter)

    def _prepare_for_search(self, cur_type):
        search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(cur_type, 'content')

        index_name = {
            'content': FILE_INDEX_NAME,
            'commit': COMMIT_INDEX_NAME,
            'path': FILE_INDEX_NAME
        }.get(cur_type, FILE_INDEX_NAME)

        schema_defn = {
            'content': self.file_schema,
            'commit': self.commit_schema,
            'path': self.file_schema
        }.get(cur_type, self.file_schema)

        log.debug('IDX: %s', index_name)
        log.debug('SCHEMA: %s', schema_defn)
        return search_type, index_name, schema_defn

    def _init_searcher(self, index_name):
        idx = open_dir(self.config['location'], indexname=index_name)
        self.searcher = idx.searcher()
        return self.searcher


Searcher = WhooshSearcher


class WhooshResultWrapper(object):
    def __init__(self, search_type, total_hits, results):
        self.search_type = search_type
        self.results = results
        self.total_hits = total_hits

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return self.total_hits

    def __iter__(self):
        """
        Allows iteration over results and lazily generates content.

        *Requires* implementation of the ``__getitem__`` method.
        """
        for hit in self.results:
            yield self.get_full_content(hit)

    def __getitem__(self, key):
        """
        Slicing of the result wrapper
        """
        i, j = key.start, key.stop
        for hit in self.results[i:j]:
            yield self.get_full_content(hit)

    def get_full_content(self, hit):
        # TODO: marcink: this feels like an overkill, there's a lot of data
        # inside the hit object, and we don't need all of it
        res = dict(hit)
        # elasticsearch uses this; we set it empty so it falls back to the
        # regular highlight logic
        res['content_highlight'] = ''

        f_path = ''  # pragma: no cover
        if self.search_type in ['content', 'path']:
            f_path = res['path'][len(res['repository']):]
            f_path = f_path.lstrip(os.sep)

        if self.search_type == 'content':
            res.update({'content_short_hl': hit.highlights('content'),
                        'f_path': f_path})
        elif self.search_type == 'path':
            res.update({'f_path': f_path})
        elif self.search_type == 'message':
            res.update({'message_hl': hit.highlights('message')})

        return res
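
# Usage sketch (illustrative only, not invoked by RhodeCode itself): construct
# the searcher with a config dict holding the whoosh index 'location', run a
# search, iterate the wrapped results, and close the underlying searcher.
# The index path and the ``auth_user`` object below are hypothetical
# placeholders, not values defined in this module.
#
#   searcher = WhooshSearcher({'location': '/path/to/whoosh/index'})
#   try:
#       result = searcher.search(
#           'collection', document_type='content',
#           search_user=auth_user, repo_name=None)
#       for entry in result['results']:
#           print(entry['repository'], entry.get('f_path'))
#   finally:
#       searcher.cleanup()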