whoosh.py
311 lines
| 10.7 KiB
| text/x-python
|
PythonLexer
r5054 | ||||
r1 | ||||
r5088 | # Copyright (C) 2012-2023 RhodeCode GmbH | |||
r1 | # | |||
# This program is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU Affero General Public License, version 3 | ||||
# (only), as published by the Free Software Foundation. | ||||
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU Affero General Public License | ||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
# | ||||
# This program is dual-licensed. If you wish to learn more about the | ||||
# RhodeCode Enterprise Edition, including its added features, Support services, | ||||
# and proprietary license terms, please see https://rhodecode.com/licenses/ | ||||
""" | ||||
Index schema for RhodeCode | ||||
""" | ||||
r4912 | ||||
r1 | import os | |||
r62 | import re | |||
r2358 | import logging | |||
r1 | ||||
r2358 | from whoosh import query as query_lib | |||
r1 | from whoosh.highlight import HtmlFormatter, ContextFragmenter | |||
from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError | ||||
from whoosh.qparser import QueryParser, QueryParserError | ||||
import rhodecode.lib.helpers as h | ||||
r3319 | from rhodecode.lib.index import BaseSearcher | |||
r5065 | from rhodecode.lib.str_utils import safe_str | |||
r1 | ||||
# Module-level logger; created before the schema import below so the fallback
# path can report itself.
log = logging.getLogger(__name__)

try:
    # we first try to import from rhodecode tools, fallback to copies if
    # we're unable to
    from rhodecode_tools.lib.fts_index.whoosh_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)
except ImportError:
    log.warning('rhodecode_tools schema not available, doing a fallback '
                'import from `rhodecode.lib.index.whoosh_fallback_schema`')
    from rhodecode.lib.index.whoosh_fallback_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)

# Shared whoosh highlighting helpers: matches are wrapped in <span> tags and
# separate fragments are joined with a "..." break marker.
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)
class WhooshSearcher(BaseSearcher):
    """
    Search backend that queries on-disk Whoosh indexes.

    Two indexes living in ``config['location']`` are used: a file index
    (``FILE_INDEX_NAME`` / ``FILE_SCHEMA``) and a commit index
    (``COMMIT_INDEX_NAME`` / ``COMMIT_SCHEMA``). Both are opened, or created
    when missing, at construction time.
    """
    # this also shows in UI
    query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
    name = 'whoosh'

    # runs of 5-40 lowercase hex characters, treated as (abbreviated) commit
    # hashes by `_extend_query`; compiled once instead of on every query
    _COMMIT_HASH_RE = re.compile(r'([0-9a-f]{5,40})')

    def __init__(self, config):
        """
        :param config: mapping that must contain a ``location`` key with the
            directory holding the indexes; the directory is created if it
            does not exist yet.
        """
        super(WhooshSearcher, self).__init__()
        self.config = config
        if not os.path.isdir(self.config['location']):
            os.makedirs(self.config['location'])

        self.commit_schema = COMMIT_SCHEMA
        self.commit_index = self._open_or_create_index(
            COMMIT_INDEX_NAME, COMMIT_SCHEMA)

        self.file_schema = FILE_SCHEMA
        self.file_index = self._open_or_create_index(
            FILE_INDEX_NAME, FILE_SCHEMA)

        # set by `_init_searcher`, released by `cleanup`
        self.searcher = None

    def _open_or_create_index(self, index_name, schema):
        """Open the named index in the configured location, creating it if absent."""
        location = self.config['location']
        opener = create_in
        if exists_in(location, indexname=index_name):
            opener = open_dir
        return opener(location, schema=schema, indexname=index_name)

    def cleanup(self):
        """Close the currently open searcher, if any, and forget it."""
        if self.searcher:
            self.searcher.close()
            # drop the reference so a repeated cleanup() is a no-op
            self.searcher = None

    def _extend_query(self, query):
        """
        Append ``commit_id:<hash>*`` prefix clauses for every hex run of
        5-40 characters found in `query`, OR-ed with the original query.

        The query is returned unchanged when no such run is present.
        """
        hashes = self._COMMIT_HASH_RE.findall(query)
        if hashes:
            hashes_or_query = ' OR '.join(
                'commit_id:%s*' % commit_hash for commit_hash in hashes)
            query = u'(%s) OR %s' % (query, hashes_or_query)
        return query

    def sort_def(self, search_type, direction, sort_field):
        """
        Translate a UI sort field into the index field name to sort by.

        :param search_type: one of ``commit``, ``path`` or ``content``
        :param direction: not used by this implementation
        :param sort_field: field name as requested by the caller
        :return: the index field name; ``''`` for an unknown `search_type`;
            ``None`` when `sort_field` is not sortable for that type
        """
        if search_type == 'commit':
            field_defs = {
                'message': 'message',
                'date': 'date',
                'author_email': 'author',
            }
        elif search_type == 'path':
            field_defs = {
                'file': 'path',
                'size': 'size',
                'lines': 'lines',
            }
        elif search_type == 'content':
            # NOTE(dan): content doesn't support any sorting
            field_defs = {}
        else:
            return ''

        return field_defs.get(sort_field)

    def search(self, query, document_type, search_user,
               repo_name=None, repo_group_name=None,
               requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
        """
        Run `query` against the index selected by `document_type`.

        :param query: user supplied query string; extended with commit hash
            clauses by `_extend_query` before parsing
        :param document_type: ``content``, ``commit``, ``path`` or
            ``repository``; anything else is handled like ``content``
        :param search_user: user object whose ``permissions`` restrict the
            repositories that may appear in the results
        :param repo_name: when given, results are limited to this repository
        :param sort: sort specification handed to ``self.get_sort``
        :return: dict with keys ``results`` (a `WhooshResultWrapper`, or an
            empty list on failure), ``count``, ``error`` (``None`` or a
            message) and ``runtime``

        `repo_group_name`, `requested_page`, `page_limit` and `raise_on_exc`
        are accepted but not used by this method; errors are always reported
        through the ``error`` key instead of being raised.
        """
        query = self._extend_query(query)

        log.debug('QUERY: %s on %s', query, document_type)
        result = {
            'results': [],
            'count': 0,
            'error': None,
            'runtime': 0
        }
        search_type, index_name, schema_defn = self._prepare_for_search(
            document_type)

        try:
            # opened inside the try so a missing or unreadable index is
            # reported via result['error'] instead of propagating
            self._init_searcher(index_name)

            qp = QueryParser(search_type, schema=schema_defn)
            allowed_repos_filter = self._get_repo_filter(
                search_user, repo_name)
            try:
                query = qp.parse(safe_str(query))
                log.debug('query: %s (%s)', query, repr(query))

                reverse, sorted_by = False, None
                direction, sort_field = self.get_sort(search_type, sort)
                if sort_field:
                    sort_definition = self.sort_def(
                        search_type, direction, sort_field)
                    if sort_definition:
                        sorted_by = sort_definition
                        reverse = direction == self.DIRECTION_DESC

                # limit=None: fetch every hit, callers slice the wrapper
                whoosh_results = self.searcher.search(
                    query, filter=allowed_repos_filter, limit=None,
                    sortedby=sorted_by, reverse=reverse)

                # fixes for 32k limit that whoosh uses for highlight
                whoosh_results.fragmenter.charlimit = None
                res_ln = whoosh_results.scored_length()
                result['runtime'] = whoosh_results.runtime
                result['count'] = res_ln
                result['results'] = WhooshResultWrapper(
                    search_type, res_ln, whoosh_results)

            except QueryParserError:
                result['error'] = 'Invalid search query. Try quoting it.'
        except (EmptyIndexError, IOError, OSError):
            msg = 'There is no index to search in. Please run whoosh indexer'
            log.exception(msg)
            result['error'] = msg
        except Exception:
            msg = 'An error occurred during this search operation'
            log.exception(msg)
            result['error'] = msg

        return result

    def statistics(self, translator):
        """
        Describe both indexes as a list of ``{'key': ..., 'value': ...}``
        rows, with ``{'sep': True}`` entries separating the groups.

        :param translator: callable used to translate the row labels
        """
        _ = translator
        stats = [
            {'key': _('Index Type'), 'value': 'Whoosh'},
            {'sep': True},

            {'key': _('File Index'), 'value': str(self.file_index)},
            {'key': _('Indexed documents'), 'value': self.file_index.doc_count()},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())},

            {'sep': True},

            {'key': _('Commit index'), 'value': str(self.commit_index)},
            {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())}
        ]
        return stats

    def _get_repo_filter(self, auth_user, repo_name):
        """
        Build the whoosh filter query restricting results by repository.

        :return: an ``Or`` of ``repository`` terms, or ``None`` (no
            filtering) when no `repo_name` is given and the user holds the
            global ``hg.admin`` permission
        """
        if repo_name:
            repo_filter = [query_lib.Term('repository', repo_name)]

        elif 'hg.admin' in auth_user.permissions.get('global', []):
            return None

        else:
            # only needed on this branch, so it is computed here
            repo_filter = [
                query_lib.Term('repository', _rn)
                for _rn, perm in auth_user.permissions['repositories'].items()
                if perm != 'repository.none']
            # in case we're not allowed to search anywhere, it's a trick
            # to tell whoosh we're filtering, on ALL results
            repo_filter = repo_filter or [query_lib.Term('repository', '')]

        return query_lib.Or(repo_filter)

    def _prepare_for_search(self, cur_type):
        """
        Map a document type onto the default query field, the index name and
        the schema to use; unknown types fall back to the file index with
        ``content`` as the default field.

        :return: tuple ``(search_type, index_name, schema_defn)``
        """
        search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(cur_type, 'content')

        index_name = {
            'content': FILE_INDEX_NAME,
            'commit': COMMIT_INDEX_NAME,
            'path': FILE_INDEX_NAME
        }.get(cur_type, FILE_INDEX_NAME)

        schema_defn = {
            'content': self.file_schema,
            'commit': self.commit_schema,
            'path': self.file_schema
        }.get(cur_type, self.file_schema)

        log.debug('IDX: %s', index_name)
        log.debug('SCHEMA: %s', schema_defn)
        return search_type, index_name, schema_defn

    def _init_searcher(self, index_name):
        """
        Open `index_name` from the configured location and store a new
        searcher for it on ``self.searcher``.

        A previously stored searcher is replaced without being closed here.
        """
        idx = open_dir(self.config['location'], indexname=index_name)
        self.searcher = idx.searcher()
        return self.searcher


# generic alias for this backend's searcher class
Searcher = WhooshSearcher
class WhooshResultWrapper(object):
    """
    Lazy view over raw whoosh results.

    Hits are converted to plain dicts, enriched with highlight / path
    information, only when they are iterated, sliced or indexed.
    """

    def __init__(self, search_type, total_hits, results):
        """
        :param search_type: field that was searched; ``content``, ``path``
            and ``message`` get extra keys in `get_full_content`
        :param total_hits: number reported by ``len()``
        :param results: iterable, sliceable collection of hits
        """
        self.search_type = search_type
        self.results = results
        self.total_hits = total_hits

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return self.total_hits

    def __iter__(self):
        """
        Iterate over all results, expanding each hit lazily.
        """
        for hit in self.results:
            yield self.get_full_content(hit)

    def __getitem__(self, key):
        """
        Slice or index the wrapped results.

        A slice yields the expanded hits lazily (the slice step is ignored).
        An integer returns the single expanded hit at that position.
        """
        if isinstance(key, int):
            return self.get_full_content(self.results[key])
        return self._iter_slice(key)

    def _iter_slice(self, key):
        """Lazily yield the expanded hits between ``key.start`` and ``key.stop``."""
        i, j = key.start, key.stop
        for hit in self.results[i:j]:
            yield self.get_full_content(hit)

    def get_full_content(self, hit):
        """
        Convert `hit` into a dict and add the keys used for rendering:
        ``content_highlight`` always; ``f_path`` for content/path searches;
        ``content_short_hl`` or ``message_hl`` highlight snippets for
        content and message searches respectively.
        """
        # TODO: marcink: this feels like an overkill, there's a lot of data
        # inside hit object, and we don't need all
        res = dict(hit)
        # elastic search uses that, we set it empty so it fallbacks to regular HL logic
        res['content_highlight'] = ''

        f_path = ''  # pragma: no cover
        if self.search_type in ['content', 'path']:
            # the stored path is prefixed with the repository name; cut it
            # off together with the leading separator
            f_path = res['path'][len(res['repository']):]
            f_path = f_path.lstrip(os.sep)

        if self.search_type == 'content':
            res.update({'content_short_hl': hit.highlights('content'),
                        'f_path': f_path})
        elif self.search_type == 'path':
            res.update({'f_path': f_path})
        elif self.search_type == 'message':
            res.update({'message_hl': hit.highlights('message')})

        return res