@@ -1,286 +1,293 @@
|
1 | 1 | # -*- coding: utf-8 -*- |
|
2 | 2 | |
|
3 | 3 | # Copyright (C) 2012-2019 RhodeCode GmbH |
|
4 | 4 | # |
|
5 | 5 | # This program is free software: you can redistribute it and/or modify |
|
6 | 6 | # it under the terms of the GNU Affero General Public License, version 3 |
|
7 | 7 | # (only), as published by the Free Software Foundation. |
|
8 | 8 | # |
|
9 | 9 | # This program is distributed in the hope that it will be useful, |
|
10 | 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 | 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 | 12 | # GNU General Public License for more details. |
|
13 | 13 | # |
|
14 | 14 | # You should have received a copy of the GNU Affero General Public License |
|
15 | 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
16 | 16 | # |
|
17 | 17 | # This program is dual-licensed. If you wish to learn more about the |
|
18 | 18 | # RhodeCode Enterprise Edition, including its added features, Support services, |
|
19 | 19 | # and proprietary license terms, please see https://rhodecode.com/licenses/ |
|
20 | 20 | |
|
21 | 21 | """ |
|
22 | 22 | Index schema for RhodeCode |
|
23 | 23 | """ |
|
24 | 24 | |
|
25 | 25 | from __future__ import absolute_import |
|
26 | 26 | import os |
|
27 | 27 | import re |
|
28 | 28 | import logging |
|
29 | 29 | |
|
30 | 30 | from whoosh import query as query_lib |
|
31 | 31 | from whoosh.highlight import HtmlFormatter, ContextFragmenter |
|
32 | 32 | from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError |
|
33 | 33 | from whoosh.qparser import QueryParser, QueryParserError |
|
34 | 34 | |
|
35 | 35 | import rhodecode.lib.helpers as h |
|
36 | 36 | from rhodecode.lib.index import BaseSearcher |
|
37 | 37 | from rhodecode.lib.utils2 import safe_unicode |
|
38 | 38 | |
|
39 | 39 | log = logging.getLogger(__name__) |
|
40 | 40 | |
|
41 | 41 | |
|
42 | 42 | try: |
|
43 | 43 | # we first try to import from rhodecode tools, fallback to copies if |
|
44 | 44 | # we're unable to |
|
45 | 45 | from rhodecode_tools.lib.fts_index.whoosh_schema import ( |
|
46 | 46 | ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME, |
|
47 | 47 | COMMIT_SCHEMA) |
|
48 | 48 | except ImportError: |
|
49 | 49 | log.warning('rhodecode_tools schema not available, doing a fallback ' |
|
50 | 50 | 'import from `rhodecode.lib.index.whoosh_fallback_schema`') |
|
51 | 51 | from rhodecode.lib.index.whoosh_fallback_schema import ( |
|
52 | 52 | ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME, |
|
53 | 53 | COMMIT_SCHEMA) |
|
54 | 54 | |
|
55 | 55 | |
|
56 | 56 | FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') |
|
57 | 57 | FRAGMENTER = ContextFragmenter(200) |
|
58 | 58 | |
|
59 | 59 | log = logging.getLogger(__name__) |
|
60 | 60 | |
|
61 | 61 | |
|
62 | 62 | class WhooshSearcher(BaseSearcher): |
|
63 | 63 | # this also shows in UI |
|
64 | 64 | query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html' |
|
65 | 65 | name = 'whoosh' |
|
66 | 66 | |
|
67 | 67 | def __init__(self, config): |
|
68 | 68 | super(Searcher, self).__init__() |
|
69 | 69 | self.config = config |
|
70 | 70 | if not os.path.isdir(self.config['location']): |
|
71 | 71 | os.makedirs(self.config['location']) |
|
72 | 72 | |
|
73 | 73 | opener = create_in |
|
74 | 74 | if exists_in(self.config['location'], indexname=FILE_INDEX_NAME): |
|
75 | 75 | opener = open_dir |
|
76 | 76 | file_index = opener(self.config['location'], schema=FILE_SCHEMA, |
|
77 | 77 | indexname=FILE_INDEX_NAME) |
|
78 | 78 | |
|
79 | 79 | opener = create_in |
|
80 | 80 | if exists_in(self.config['location'], indexname=COMMIT_INDEX_NAME): |
|
81 | 81 | opener = open_dir |
|
82 | 82 | changeset_index = opener(self.config['location'], schema=COMMIT_SCHEMA, |
|
83 | 83 | indexname=COMMIT_INDEX_NAME) |
|
84 | 84 | |
|
85 | 85 | self.commit_schema = COMMIT_SCHEMA |
|
86 | 86 | self.commit_index = changeset_index |
|
87 | 87 | self.file_schema = FILE_SCHEMA |
|
88 | 88 | self.file_index = file_index |
|
89 | 89 | self.searcher = None |
|
90 | 90 | |
|
91 | 91 | def cleanup(self): |
|
92 | 92 | if self.searcher: |
|
93 | 93 | self.searcher.close() |
|
94 | 94 | |
|
95 | 95 | def _extend_query(self, query): |
|
96 | 96 | hashes = re.compile('([0-9a-f]{5,40})').findall(query) |
|
97 | 97 | if hashes: |
|
98 | 98 | hashes_or_query = ' OR '.join('commit_id:%s*' % h for h in hashes) |
|
99 | 99 | query = u'(%s) OR %s' % (query, hashes_or_query) |
|
100 | 100 | return query |
|
101 | 101 | |
|
102 | 102 | def search(self, query, document_type, search_user, |
|
103 | 103 | repo_name=None, repo_group_name=None, |
|
104 | 104 | requested_page=1, page_limit=10, sort=None, raise_on_exc=True): |
|
105 | 105 | |
|
106 | 106 | original_query = query |
|
107 | 107 | query = self._extend_query(query) |
|
108 | 108 | |
|
109 | 109 | log.debug(u'QUERY: %s on %s', query, document_type) |
|
110 | 110 | result = { |
|
111 | 111 | 'results': [], |
|
112 | 112 | 'count': 0, |
|
113 | 113 | 'error': None, |
|
114 | 114 | 'runtime': 0 |
|
115 | 115 | } |
|
116 | 116 | search_type, index_name, schema_defn = self._prepare_for_search( |
|
117 | 117 | document_type) |
|
118 | 118 | self._init_searcher(index_name) |
|
119 | 119 | try: |
|
120 | 120 | qp = QueryParser(search_type, schema=schema_defn) |
|
121 | 121 | allowed_repos_filter = self._get_repo_filter( |
|
122 | 122 | search_user, repo_name) |
|
123 | 123 | try: |
|
124 | 124 | query = qp.parse(safe_unicode(query)) |
|
125 | 125 | log.debug('query: %s (%s)', query, repr(query)) |
|
126 | 126 | |
|
127 |     |                 reverse, sortedby = False, None
128 |     |                 if search_type == 'message':
129 |     |                     if sort == 'oldfirst':
130 |     |                         sortedby = 'date'
    | 127 |                 def sort_def(_direction, _sort_field):
    | 128 |                     field2whoosh = {
    | 129 |                         'message.raw': 'message',
    | 130 |                         'author.email.raw': 'author',
    | 131 |                     }
    | 132 |                     return field2whoosh.get(_sort_field) or _sort_field
    | 133 |
    | 134 |                 reverse, sorted_by = False, None
    | 135 |                 direction, sort_field = self.get_sort(search_type, sort)
    | 136 |                 if sort_field:
    | 137 |                     if direction == Searcher.DIRECTION_DESC:
    | 138 |                         reverse = True
    | 139 |                     if direction == Searcher.DIRECTION_ASC:
131 | 140 |                         reverse = False
132 |     |                     elif sort == 'newfirst':
133 |     |                         sortedby = 'date'
134 |     |                         reverse = True
    | 141 |                     sorted_by = sort_def(direction, sort_field)
135 | 142 |
136 | 143 |                 whoosh_results = self.searcher.search(
137 | 144 |                     query, filter=allowed_repos_filter, limit=None,
138 |     |                     sortedby=sortedby, reverse=reverse)
    | 145 |                     sortedby=sorted_by, reverse=reverse)
|
139 | 146 | |
|
140 | 147 | # fixes for 32k limit that whoosh uses for highlight |
|
141 | 148 | whoosh_results.fragmenter.charlimit = None |
|
142 | 149 | res_ln = whoosh_results.scored_length() |
|
143 | 150 | result['runtime'] = whoosh_results.runtime |
|
144 | 151 | result['count'] = res_ln |
|
145 | 152 | result['results'] = WhooshResultWrapper( |
|
146 | 153 | search_type, res_ln, whoosh_results) |
|
147 | 154 | |
|
148 | 155 | except QueryParserError: |
|
149 | 156 | result['error'] = 'Invalid search query. Try quoting it.' |
|
150 | 157 | except (EmptyIndexError, IOError, OSError): |
|
151 | 158 | msg = 'There is no index to search in. Please run whoosh indexer' |
|
152 | 159 | log.exception(msg) |
|
153 | 160 | result['error'] = msg |
|
154 | 161 | except Exception: |
|
155 | 162 | msg = 'An error occurred during this search operation' |
|
156 | 163 | log.exception(msg) |
|
157 | 164 | result['error'] = msg |
|
158 | 165 | |
|
159 | 166 | return result |
|
160 | 167 | |
|
161 | 168 | def statistics(self, translator): |
|
162 | 169 | _ = translator |
|
163 | 170 | stats = [ |
|
164 | 171 | {'key': _('Index Type'), 'value': 'Whoosh'}, |
|
165 | 172 | {'sep': True}, |
|
166 | 173 | |
|
167 | 174 | {'key': _('File Index'), 'value': str(self.file_index)}, |
|
168 | 175 | {'key': _('Indexed documents'), 'value': self.file_index.doc_count()}, |
|
169 | 176 | {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())}, |
|
170 | 177 | |
|
171 | 178 | {'sep': True}, |
|
172 | 179 | |
|
173 | 180 | {'key': _('Commit index'), 'value': str(self.commit_index)}, |
|
174 | 181 | {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())}, |
|
175 | 182 | {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())} |
|
176 | 183 | ] |
|
177 | 184 | return stats |
|
178 | 185 | |
|
179 | 186 | def _get_repo_filter(self, auth_user, repo_name): |
|
180 | 187 | |
|
181 | 188 | allowed_to_search = [ |
|
182 | 189 | repo for repo, perm in |
|
183 | 190 | auth_user.permissions['repositories'].items() |
|
184 | 191 | if perm != 'repository.none'] |
|
185 | 192 | |
|
186 | 193 | if repo_name: |
|
187 | 194 | repo_filter = [query_lib.Term('repository', repo_name)] |
|
188 | 195 | |
|
189 | 196 | elif 'hg.admin' in auth_user.permissions.get('global', []): |
|
190 | 197 | return None |
|
191 | 198 | |
|
192 | 199 | else: |
|
193 | 200 | repo_filter = [query_lib.Term('repository', _rn) |
|
194 | 201 | for _rn in allowed_to_search] |
|
195 | 202 | # in case we're not allowed to search anywhere, it's a trick |
|
196 | 203 | # to tell whoosh we're filtering, on ALL results |
|
197 | 204 | repo_filter = repo_filter or [query_lib.Term('repository', '')] |
|
198 | 205 | |
|
199 | 206 | return query_lib.Or(repo_filter) |
|
200 | 207 | |
|
201 | 208 | def _prepare_for_search(self, cur_type): |
|
202 | 209 | search_type = { |
|
203 | 210 | 'content': 'content', |
|
204 | 211 | 'commit': 'message', |
|
205 | 212 | 'path': 'path', |
|
206 | 213 | 'repository': 'repository' |
|
207 | 214 | }.get(cur_type, 'content') |
|
208 | 215 | |
|
209 | 216 | index_name = { |
|
210 | 217 | 'content': FILE_INDEX_NAME, |
|
211 | 218 | 'commit': COMMIT_INDEX_NAME, |
|
212 | 219 | 'path': FILE_INDEX_NAME |
|
213 | 220 | }.get(cur_type, FILE_INDEX_NAME) |
|
214 | 221 | |
|
215 | 222 | schema_defn = { |
|
216 | 223 | 'content': self.file_schema, |
|
217 | 224 | 'commit': self.commit_schema, |
|
218 | 225 | 'path': self.file_schema |
|
219 | 226 | }.get(cur_type, self.file_schema) |
|
220 | 227 | |
|
221 | 228 | log.debug('IDX: %s', index_name) |
|
222 | 229 | log.debug('SCHEMA: %s', schema_defn) |
|
223 | 230 | return search_type, index_name, schema_defn |
|
224 | 231 | |
|
225 | 232 | def _init_searcher(self, index_name): |
|
226 | 233 | idx = open_dir(self.config['location'], indexname=index_name) |
|
227 | 234 | self.searcher = idx.searcher() |
|
228 | 235 | return self.searcher |
|
229 | 236 | |
|
230 | 237 | |
|
231 | 238 | Searcher = WhooshSearcher |
|
232 | 239 | |
|
233 | 240 | |
|
234 | 241 | class WhooshResultWrapper(object): |
|
235 | 242 | def __init__(self, search_type, total_hits, results): |
|
236 | 243 | self.search_type = search_type |
|
237 | 244 | self.results = results |
|
238 | 245 | self.total_hits = total_hits |
|
239 | 246 | |
|
240 | 247 | def __str__(self): |
|
241 | 248 | return '<%s at %s>' % (self.__class__.__name__, len(self)) |
|
242 | 249 | |
|
243 | 250 | def __repr__(self): |
|
244 | 251 | return self.__str__() |
|
245 | 252 | |
|
246 | 253 | def __len__(self): |
|
247 | 254 | return self.total_hits |
|
248 | 255 | |
|
249 | 256 | def __iter__(self): |
|
250 | 257 | """ |
|
251 | 258 | Allows Iteration over results,and lazy generate content |
|
252 | 259 | |
|
253 | 260 | *Requires* implementation of ``__getitem__`` method. |
|
254 | 261 | """ |
|
255 | 262 | for hit in self.results: |
|
256 | 263 | yield self.get_full_content(hit) |
|
257 | 264 | |
|
258 | 265 | def __getitem__(self, key): |
|
259 | 266 | """ |
|
260 | 267 | Slicing of resultWrapper |
|
261 | 268 | """ |
|
262 | 269 | i, j = key.start, key.stop |
|
263 | 270 | for hit in self.results[i:j]: |
|
264 | 271 | yield self.get_full_content(hit) |
|
265 | 272 | |
|
266 | 273 | def get_full_content(self, hit): |
|
267 | 274 | # TODO: marcink: this feels like an overkill, there's a lot of data |
|
268 | 275 | # inside hit object, and we don't need all |
|
269 | 276 | res = dict(hit) |
|
270 | 277 | # elastic search uses that, we set it empty so it fallbacks to regular HL logic |
|
271 | 278 | res['content_highlight'] = '' |
|
272 | 279 | |
|
273 | 280 | f_path = '' # pragma: no cover |
|
274 | 281 | if self.search_type in ['content', 'path']: |
|
275 | 282 | f_path = res['path'][len(res['repository']):] |
|
276 | 283 | f_path = f_path.lstrip(os.sep) |
|
277 | 284 | |
|
278 | 285 | if self.search_type == 'content': |
|
279 | 286 | res.update({'content_short_hl': hit.highlights('content'), |
|
280 | 287 | 'f_path': f_path}) |
|
281 | 288 | elif self.search_type == 'path': |
|
282 | 289 | res.update({'f_path': f_path}) |
|
283 | 290 | elif self.search_type == 'message': |
|
284 | 291 | res.update({'message_hl': hit.highlights('message')}) |
|
285 | 292 | |
|
286 | 293 | return res |
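
A note for reviewers on the core of this change: the hard-coded `oldfirst`/`newfirst` handling, which could only sort commit messages by `date`, is replaced by a generic sort step. `self.get_sort(search_type, sort)` yields a `(direction, sort_field)` pair, and the local `sort_def()` helper translates the `.raw`-suffixed sort field names (`message.raw`, `author.email.raw`) into the field names the Whoosh schema actually indexes. Below is a minimal, self-contained sketch of that flow; the `DIRECTION_ASC`/`DIRECTION_DESC` string values and the standalone `resolve_sort()` wrapper are illustrative assumptions, not part of the diff.

```python
# Sketch of the new sort resolution, mirroring the added lines 127-141.
# Assumptions: the DIRECTION_* values and resolve_sort() are illustrative only.
DIRECTION_ASC = 'asc'    # assumed value of Searcher.DIRECTION_ASC
DIRECTION_DESC = 'desc'  # assumed value of Searcher.DIRECTION_DESC


def sort_def(_direction, _sort_field):
    # Map '.raw'-style sort fields onto the fields the Whoosh schema indexes.
    field2whoosh = {
        'message.raw': 'message',
        'author.email.raw': 'author',
    }
    return field2whoosh.get(_sort_field) or _sort_field


def resolve_sort(direction, sort_field):
    # Turn (direction, field) into the kwargs Whoosh's searcher.search() accepts.
    reverse, sorted_by = False, None
    if sort_field:
        if direction == DIRECTION_DESC:
            reverse = True
        if direction == DIRECTION_ASC:
            reverse = False
        sorted_by = sort_def(direction, sort_field)
    return {'sortedby': sorted_by, 'reverse': reverse}


print(resolve_sort(DIRECTION_DESC, 'author.email.raw'))
# -> {'sortedby': 'author', 'reverse': True}
print(resolve_sort(None, None))
# -> {'sortedby': None, 'reverse': False}  (unsorted, Whoosh's default relevance order)
```

The resulting pair feeds straight into `self.searcher.search(query, filter=allowed_repos_filter, limit=None, sortedby=sorted_by, reverse=reverse)`, so an unrecognized field name simply falls through `field2whoosh.get()` and is passed to Whoosh unchanged.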