##// END OF EJS Templates
core: avoid using rhodecode.test packages inside main packages as tests are removed during build which can cause some problems in some edge case calls
core: avoid using rhodecode.test packages inside main packages as tests are removed during build which can cause some problems in some edge case calls

File last commit:

r5608:6d33e504 default
r5618:bdbdb63f default
Show More
whoosh.py
309 lines | 10.7 KiB | text/x-python | PythonLexer
core: updated copyright to 2024
r5608 # Copyright (C) 2012-2024 RhodeCode GmbH
project: added all source files and assets
r1 #
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License, version 3
# (only), as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This program is dual-licensed. If you wish to learn more about the
# RhodeCode Enterprise Edition, including its added features, Support services,
# and proprietary license terms, please see https://rhodecode.com/licenses/
"""
Index schema for RhodeCode
"""
python3: removed from future imports
r4912
project: added all source files and assets
r1 import os
dan
feature: Go To switcher now searches commit hashes as well
r62 import re
pylons: fixed code and test suite after removal of pylons.
r2358 import logging
project: added all source files and assets
r1
pylons: fixed code and test suite after removal of pylons.
r2358 from whoosh import query as query_lib
project: added all source files and assets
r1 from whoosh.highlight import HtmlFormatter, ContextFragmenter
from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
from whoosh.qparser import QueryParser, QueryParserError
import rhodecode.lib.helpers as h
dan
search: add support for elastic search 6...
r3319 from rhodecode.lib.index import BaseSearcher
core: multiple fixes to unicode vs str usage...
r5065 from rhodecode.lib.str_utils import safe_str
project: added all source files and assets
r1
# Module-level logger; defined before the schema import below because the
# fallback branch logs a warning.
log = logging.getLogger(__name__)

try:
    # we first try to import from rhodecode tools, fallback to copies if
    # we're unable to
    from rhodecode_tools.lib.fts_index.whoosh_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)
except ImportError:
    log.warning('rhodecode_tools schema not available, doing a fallback '
                'import from `rhodecode.lib.index.whoosh_fallback_schema`')
    from rhodecode.lib.index.whoosh_fallback_schema import (
        ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
        COMMIT_SCHEMA)

# Highlight formatter: matches are wrapped in <span> tags and fragments are
# joined by a visible "..." break marker.
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
# Highlight fragmenter constructed with the value 200.
FRAGMENTER = ContextFragmenter(200)
dan
search: add support for elastic search 6...
class WhooshSearcher(BaseSearcher):
    """
    Search backend that queries Whoosh indexes stored in
    ``config['location']``.

    Two indexes are used: one named ``FILE_INDEX_NAME`` (file content and
    paths) and one named ``COMMIT_INDEX_NAME`` (commits). Both are opened,
    or created when missing, at construction time.
    """

    # this also shows in UI
    query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
    name = 'whoosh'

    # Anything that looks like a (partial) commit hash: 5 to 40 lowercase
    # hexadecimal characters. Compiled once instead of on every query.
    _COMMIT_HASH_RE = re.compile(r'([0-9a-f]{5,40})')

    # Per document type: sort key accepted from the caller -> index field.
    _SORT_FIELDS = {
        'commit': {
            'message': 'message',
            'date': 'date',
            'author_email': 'author',
        },
        'path': {
            'file': 'path',
            'size': 'size',
            'lines': 'lines',
        },
        # NOTE(dan): content doesn't support any sorting
        'content': {},
    }

    def __init__(self, config):
        """
        :param config: mapping with a ``location`` key pointing at the
            directory that holds (or will hold) the index files.
        """
        super().__init__()
        self.config = config
        # exist_ok avoids the check-then-create race of isdir()+makedirs()
        os.makedirs(self.config['location'], exist_ok=True)

        self.commit_schema = COMMIT_SCHEMA
        self.commit_index = self._open_or_create_index(
            COMMIT_SCHEMA, COMMIT_INDEX_NAME)
        self.file_schema = FILE_SCHEMA
        self.file_index = self._open_or_create_index(
            FILE_SCHEMA, FILE_INDEX_NAME)
        self.searcher = None

    def _open_or_create_index(self, schema, index_name):
        """Open the named index under the configured location, creating it
        with ``schema`` when it does not exist yet."""
        location = self.config['location']
        if exists_in(location, indexname=index_name):
            opener = open_dir
        else:
            opener = create_in
        return opener(location, schema=schema, indexname=index_name)

    def cleanup(self):
        """Close the searcher opened by the last :meth:`search`, if any."""
        if self.searcher:
            self.searcher.close()

    def _extend_query(self, query):
        """
        Append ``commit_id:<hash>*`` alternatives for every hash-like token
        found in ``query``.

        The original query is kept, wrapped in parentheses, and OR-ed with
        one prefix term per token. A query without such tokens is returned
        unchanged.
        """
        hashes = self._COMMIT_HASH_RE.findall(query)
        if hashes:
            hashes_or_query = ' OR '.join(
                'commit_id:%s*' % commit_hash for commit_hash in hashes)
            query = '(%s) OR %s' % (query, hashes_or_query)
        return query

    def sort_def(self, search_type, direction, sort_field):
        """
        Translate a requested sort key into an index field name.

        :param search_type: document type: ``commit``, ``path`` or
            ``content``.
        :param direction: not used by this backend.
        :param sort_field: sort key requested by the caller.
        :return: the index field name; ``None`` when ``sort_field`` is not
            sortable for this type; ``''`` when ``search_type`` is unknown.
        """
        field_defs = self._SORT_FIELDS.get(search_type)
        if field_defs is None:
            return ''
        return field_defs.get(sort_field)

    def search(self, query, document_type, search_user,
               repo_name=None, repo_group_name=None,
               requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
        """
        Run ``query`` against the index that matches ``document_type``.

        :param query: raw query string in Whoosh query language.
        :param document_type: ``content``, ``commit``, ``path`` or
            ``repository``; unknown values are treated as ``content``.
        :param search_user: user object whose ``permissions`` restrict the
            repositories that may appear in the results.
        :param repo_name: when given, restrict results to this repository.
        :param sort: sort specification, parsed by ``self.get_sort``.
        :param repo_group_name: not used by this backend.
        :param requested_page: not used by this backend.
        :param page_limit: not used by this backend.
        :param raise_on_exc: not used by this backend.
        :return: dict with keys ``results`` (a :class:`WhooshResultWrapper`
            on success, else an empty list), ``count``, ``error`` (message
            string or ``None``) and ``runtime``. Errors are reported via
            ``error`` and are not raised.
        """
        query = self._extend_query(query)

        log.debug('QUERY: %s on %s', query, document_type)
        result = {
            'results': [],
            'count': 0,
            'error': None,
            'runtime': 0
        }
        search_type, index_name, schema_defn = self._prepare_for_search(
            document_type)

        try:
            # Opened inside the try so that a missing/unreadable index is
            # reported through the "no index" handler below.
            self._init_searcher(index_name)
            qp = QueryParser(search_type, schema=schema_defn)
            allowed_repos_filter = self._get_repo_filter(
                search_user, repo_name)

            query = qp.parse(safe_str(query))
            log.debug('query: %s (%s)', query, repr(query))

            # Sorting is keyed on the document type ('commit'), not on the
            # query field name ('message'); passing the field name made the
            # commit sort definitions unreachable.
            # NOTE(review): get_sort lives on BaseSearcher (not visible
            # here); presumably it keys on the document type as well -
            # confirm.
            reverse, sorted_by = False, None
            direction, sort_field = self.get_sort(document_type, sort)
            if sort_field:
                sort_definition = self.sort_def(
                    document_type, direction, sort_field)
                if sort_definition:
                    sorted_by = sort_definition
                    reverse = direction == self.DIRECTION_DESC

            whoosh_results = self.searcher.search(
                query, filter=allowed_repos_filter, limit=None,
                sortedby=sorted_by, reverse=reverse)

            # fixes for 32k limit that whoosh uses for highlight
            whoosh_results.fragmenter.charlimit = None
            res_ln = whoosh_results.scored_length()
            result['runtime'] = whoosh_results.runtime
            result['count'] = res_ln
            result['results'] = WhooshResultWrapper(
                search_type, res_ln, whoosh_results)
        except QueryParserError:
            result['error'] = 'Invalid search query. Try quoting it.'
        except (EmptyIndexError, IOError, OSError):
            msg = 'There is no index to search in. Please run whoosh indexer'
            log.exception(msg)
            result['error'] = msg
        except Exception:
            msg = 'An error occurred during this search operation'
            log.exception(msg)
            result['error'] = msg

        return result

    def statistics(self, translator):
        """
        Describe both indexes for display.

        :param translator: callable applied to every label.
        :return: list of ``{'key': ..., 'value': ...}`` rows, with
            ``{'sep': True}`` entries as separators.
        """
        _ = translator
        stats = [
            {'key': _('Index Type'), 'value': 'Whoosh'},
            {'sep': True},

            {'key': _('File Index'), 'value': str(self.file_index)},
            {'key': _('Indexed documents'), 'value': self.file_index.doc_count()},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())},

            {'sep': True},

            {'key': _('Commit index'), 'value': str(self.commit_index)},
            {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())}
        ]
        return stats

    def _get_repo_filter(self, auth_user, repo_name):
        """
        Build the Whoosh filter limiting results to permitted repositories.

        :return: ``None`` (no filtering) for users holding the global
            ``hg.admin`` permission when no ``repo_name`` is given;
            otherwise an ``Or`` query over ``repository`` terms.
        """
        if repo_name:
            repo_filter = [query_lib.Term('repository', repo_name)]
        elif 'hg.admin' in auth_user.permissions.get('global', []):
            return None
        else:
            # permissions are only scanned when they are actually needed
            repo_filter = [
                query_lib.Term('repository', _rn)
                for _rn, perm in auth_user.permissions['repositories'].items()
                if perm != 'repository.none']
            # in case we're not allowed to search anywhere, it's a trick
            # to tell whoosh we're filtering, on ALL results
            repo_filter = repo_filter or [query_lib.Term('repository', '')]

        return query_lib.Or(repo_filter)

    def _prepare_for_search(self, cur_type):
        """
        Resolve a document type into what is needed to query it.

        :return: tuple ``(default query field, index name, schema)``.
            Unknown types fall back to the ``content`` settings.
        """
        search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(cur_type, 'content')

        index_name = {
            'content': FILE_INDEX_NAME,
            'commit': COMMIT_INDEX_NAME,
            'path': FILE_INDEX_NAME
        }.get(cur_type, FILE_INDEX_NAME)

        schema_defn = {
            'content': self.file_schema,
            'commit': self.commit_schema,
            'path': self.file_schema
        }.get(cur_type, self.file_schema)

        log.debug('IDX: %s', index_name)
        log.debug('SCHEMA: %s', schema_defn)
        return search_type, index_name, schema_defn

    def _init_searcher(self, index_name):
        """Open the named index, store a new searcher on ``self.searcher``
        and return it."""
        idx = open_dir(self.config['location'], indexname=index_name)
        self.searcher = idx.searcher()
        return self.searcher


# Backend-neutral alias for this module's searcher class.
Searcher = WhooshSearcher
project: added all source files and assets
class WhooshResultWrapper(object):
    """
    Sized, lazily evaluated view over Whoosh hits.

    Each hit is converted to a plain dict, enriched with highlight and
    path information, only when it is iterated or indexed.
    """

    def __init__(self, search_type, total_hits, results):
        """
        :param search_type: ``content``, ``path`` or ``message``; selects
            which extra keys :meth:`get_full_content` adds.
        :param total_hits: value reported by ``len()``.
        :param results: iterable, sliceable collection of hits.
        """
        self.search_type = search_type
        self.results = results
        self.total_hits = total_hits

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return self.total_hits

    def __iter__(self):
        """
        Iterate over all results, generating the content of each lazily.
        """
        for hit in self.results:
            yield self.get_full_content(hit)

    def __getitem__(self, key):
        """
        Slice or index the wrapped results.

        A slice returns a lazy generator over ``results[start:stop]`` (the
        step is ignored). Any other key is passed to ``results`` and the
        single matching hit is returned as a dict; previously such keys
        produced a generator that failed when consumed.
        """
        if isinstance(key, slice):
            return self._iter_slice(key.start, key.stop)
        return self.get_full_content(self.results[key])

    def _iter_slice(self, start, stop):
        """Lazily yield the full content of ``results[start:stop]``."""
        for hit in self.results[start:stop]:
            yield self.get_full_content(hit)

    def get_full_content(self, hit):
        """
        Convert ``hit`` into a dict with search-type specific extras.

        * ``content``: adds ``content_short_hl`` and ``f_path``
        * ``path``: adds ``f_path``
        * ``message``: adds ``message_hl``

        ``f_path`` is the hit's ``path`` with its first
        ``len(repository)`` characters and any leading ``os.sep`` removed.
        """
        # TODO: marcink: this feels like an overkill, there's a lot of data
        # inside hit object, and we don't need all
        res = dict(hit)
        # elastic search uses that, we set it empty so it fallbacks to regular HL logic
        res['content_highlight'] = ''

        f_path = ''  # pragma: no cover
        if self.search_type in ['content', 'path']:
            f_path = res['path'][len(res['repository']):]
            f_path = f_path.lstrip(os.sep)

        if self.search_type == 'content':
            res.update({'content_short_hl': hit.highlights('content'),
                        'f_path': f_path})
        elif self.search_type == 'path':
            res.update({'f_path': f_path})
        elif self.search_type == 'message':
            res.update({'message_hl': hit.highlights('message')})

        return res