rhodecode-enterprise-ce Files · rhodecode/lib/index/whoosh.py

pull-requests: add merge check that detects WIP marker in title. This will prevent merges in such case....

pull-requests: add a merge check that detects a WIP marker in the title. This prevents merges in such cases. A WIP marker in the title usually means an unfinished task that still needs some work. This pattern is present in GitLab/GitHub and is already quite common.

dan - - Load All Authors

File last commit:

r3968:2ec277c2 default


                r4099:c12e69d0

default

Download file

             whoosh.py
        
                    311 lines
            
             | 10.7 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / index / whoosh.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # -*- coding: utf-8 -*-

      # Copyright (C) 2012-2019 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      """

      Index schema for RhodeCode

      """

      from __future__ import absolute_import

      import os

      import re

      import logging

      from whoosh import query as query_lib

      from whoosh.highlight import HtmlFormatter, ContextFragmenter

      from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError

      from whoosh.qparser import QueryParser, QueryParserError

      import rhodecode.lib.helpers as h

      from rhodecode.lib.index import BaseSearcher

      from rhodecode.lib.utils2 import safe_unicode

      log = logging.getLogger(__name__)

      try:

          # we first try to import from rhodecode tools, fallback to copies if

          # we're unable to

          from rhodecode_tools.lib.fts_index.whoosh_schema import (

              ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,

              COMMIT_SCHEMA)

      except ImportError:

          log.warning('rhodecode_tools schema not available, doing a fallback '

                      'import from `rhodecode.lib.index.whoosh_fallback_schema`')

          from rhodecode.lib.index.whoosh_fallback_schema import (

              ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,

              COMMIT_SCHEMA)

      # Highlight defaults: the formatter is built with the ``span`` tag name and
      # a visible "..." break marker that is placed between separate fragments.
      FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

      # Fragmenter built with 200 as its first (size) argument.
      FRAGMENTER = ContextFragmenter(200)

      class WhooshSearcher(BaseSearcher):

          # this also shows in UI

          query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'

          name = 'whoosh'

          def __init__(self, config):
              """
              Open the file and commit indexes, creating whichever is missing.

              :param config: mapping whose ``location`` entry is the directory
                  that holds the whoosh indexes; the directory is created when
                  it does not exist yet.
              """
              # name the class explicitly: the module level ``Searcher`` alias is
              # only bound after the class body and could be re-bound elsewhere
              super(WhooshSearcher, self).__init__()
              self.config = config
              location = self.config['location']

              if not os.path.isdir(location):
                  try:
                      os.makedirs(location)
                  except OSError:
                      # the directory may have been created by someone else
                      # between the check and the call; fail only if it is
                      # still missing
                      if not os.path.isdir(location):
                          raise

              def open_or_create(index_name, schema):
                  # re-use an index that is already on disk, otherwise create it
                  if exists_in(location, indexname=index_name):
                      opener = open_dir
                  else:
                      opener = create_in
                  return opener(location, schema=schema, indexname=index_name)

              self.file_schema = FILE_SCHEMA
              self.file_index = open_or_create(FILE_INDEX_NAME, FILE_SCHEMA)
              self.commit_schema = COMMIT_SCHEMA
              self.commit_index = open_or_create(COMMIT_INDEX_NAME, COMMIT_SCHEMA)

              # set by ``_init_searcher`` and released by ``cleanup``
              self.searcher = None

          def cleanup(self):
              """
              Close the searcher opened by the last search, if there is one.

              The reference is cleared before closing, so calling this method
              again is a no-op instead of closing the same searcher twice.
              """
              searcher, self.searcher = self.searcher, None
              if searcher is not None:
                  searcher.close()

          def _extend_query(self, query):

              hashes = re.compile('([0-9a-f]{5,40})').findall(query)

              if hashes:

                  hashes_or_query = ' OR '.join('commit_id:%s*' % h for h in hashes)

                  query = u'(%s) OR %s' % (query, hashes_or_query)

              return query

          def sort_def(self, search_type, direction, sort_field):
              """
              Translate a requested sort field into the index field to sort by.

              :param search_type: ``commit`` or ``message`` (commit index),
                  ``path`` or ``content``
              :param direction: not used here, kept for interface compatibility
              :param sort_field: sort key as requested by the caller
              :return: name of the index field; ``None`` when ``sort_field`` is
                  not sortable for a known search type; ``''`` when the search
                  type itself is unknown
              """
              # ``search()`` passes the value produced by ``_prepare_for_search``,
              # which is ``message`` for commit searches, so both spellings have
              # to select the commit fields
              if search_type in ('commit', 'message'):
                  field_defs = {
                      'message': 'message',
                      'date': 'date',
                      'author_email': 'author',
                  }
              elif search_type == 'path':
                  field_defs = {
                      'file': 'path',
                      'size': 'size',
                      'lines': 'lines',
                  }
              elif search_type == 'content':
                  # NOTE(dan): content doesn't support any sorting
                  field_defs = {}
              else:
                  return ''

              return field_defs.get(sort_field)

          def search(self, query, document_type, search_user,
                     repo_name=None, repo_group_name=None,
                     requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
              """
              Run ``query`` against the index selected by ``document_type``.

              :param query: query string
              :param document_type: ``content``, ``commit`` or ``path``; other
                  values fall back to the file index (see ``_prepare_for_search``)
              :param search_user: user whose permissions restrict the results
              :param repo_name: optional, restrict results to this repository
              :param repo_group_name: accepted but not used by this method
              :param requested_page: accepted but not used by this method
              :param page_limit: accepted but not used; ``limit=None`` is passed
                  to whoosh and slicing is left to the returned wrapper
              :param sort: sort specification handed to ``self.get_sort``
              :param raise_on_exc: accepted but not used; failures are always
                  reported through the ``error`` key
              :return: dict with the keys ``results``, ``count``, ``error`` and
                  ``runtime``
              """
              query = self._extend_query(query)

              log.debug(u'QUERY: %s on %s', query, document_type)
              result = {
                  'results': [],
                  'count': 0,
                  'error': None,
                  'runtime': 0
              }
              search_type, index_name, schema_defn = self._prepare_for_search(
                  document_type)
              try:
                  # opening the index has to happen inside the ``try``: it is the
                  # call the "no index" handler below is meant for
                  self._init_searcher(index_name)

                  qp = QueryParser(search_type, schema=schema_defn)
                  allowed_repos_filter = self._get_repo_filter(
                      search_user, repo_name)
                  try:
                      query = qp.parse(safe_unicode(query))
                      log.debug('query: %s (%s)', query, repr(query))

                      reverse, sorted_by = False, None
                      direction, sort_field = self.get_sort(search_type, sort)
                      if sort_field:
                          sort_definition = self.sort_def(
                              search_type, direction, sort_field)
                          if sort_definition:
                              sorted_by = sort_definition
                              # anything but an explicit DESC sorts ascending
                              reverse = direction == self.DIRECTION_DESC

                      whoosh_results = self.searcher.search(
                          query, filter=allowed_repos_filter, limit=None,
                          sortedby=sorted_by, reverse=reverse)

                      # fixes for 32k limit that whoosh uses for highlight
                      whoosh_results.fragmenter.charlimit = None
                      res_ln = whoosh_results.scored_length()
                      result['runtime'] = whoosh_results.runtime
                      result['count'] = res_ln
                      # the wrapper reads hits lazily, so the searcher stays
                      # open until ``cleanup`` is called
                      result['results'] = WhooshResultWrapper(
                          search_type, res_ln, whoosh_results)

                  except QueryParserError:
                      result['error'] = 'Invalid search query. Try quoting it.'
              except (EmptyIndexError, IOError, OSError):
                  msg = 'There is no index to search in. Please run whoosh indexer'
                  log.exception(msg)
                  result['error'] = msg
              except Exception:
                  msg = 'An error occurred during this search operation'
                  log.exception(msg)
                  result['error'] = msg

              return result

          def statistics(self, translator):
              """
              Describe the file and commit indexes as a list of display rows.

              :param translator: callable applied to every row label
              :return: list of ``{'key': ..., 'value': ...}`` dicts, with
                  ``{'sep': True}`` entries between the groups
              """
              _ = translator
              rows = [
                  {'key': _('Index Type'), 'value': 'Whoosh'},
                  {'sep': True},
              ]

              # file index; the document count is passed on as returned
              file_idx = self.file_index
              rows.append({'key': _('File Index'), 'value': str(file_idx)})
              rows.append({'key': _('Indexed documents'),
                           'value': file_idx.doc_count()})
              rows.append({'key': _('Last update'),
                           'value': h.time_to_datetime(file_idx.last_modified())})

              rows.append({'sep': True})

              # commit index; here the document count is converted to a string
              commit_idx = self.commit_index
              rows.append({'key': _('Commit index'), 'value': str(commit_idx)})
              rows.append({'key': _('Indexed documents'),
                           'value': str(commit_idx.doc_count())})
              rows.append({'key': _('Last update'),
                           'value': h.time_to_datetime(commit_idx.last_modified())})
              return rows

          def _get_repo_filter(self, auth_user, repo_name):

              allowed_to_search = [

                  repo for repo, perm in

                  auth_user.permissions['repositories'].items()

                  if perm != 'repository.none']

              if repo_name:

                  repo_filter = [query_lib.Term('repository', repo_name)]

              elif 'hg.admin' in auth_user.permissions.get('global', []):

                  return None

              else:

                  repo_filter = [query_lib.Term('repository', _rn)

                                 for _rn in allowed_to_search]

                  # in case we're not allowed to search anywhere, it's a trick

                  # to tell whoosh we're filtering, on ALL results

                  repo_filter = repo_filter or [query_lib.Term('repository', '')]

              return query_lib.Or(repo_filter)

          def _prepare_for_search(self, cur_type):
              """
              Resolve a document type to search field, index name and schema.

              Unknown document types fall back to the ``content`` field, the
              file index and the file schema.

              :param cur_type: requested document type
              :return: tuple ``(search_type, index_name, schema_defn)``
              """
              default_fields = {
                  'content': 'content',
                  'commit': 'message',
                  'path': 'path',
                  'repository': 'repository'
              }
              search_type = default_fields.get(cur_type, 'content')

              # index name and schema always go together, so keep them paired
              file_target = (FILE_INDEX_NAME, self.file_schema)
              targets = {
                  'content': file_target,
                  'commit': (COMMIT_INDEX_NAME, self.commit_schema),
                  'path': file_target
              }
              index_name, schema_defn = targets.get(cur_type, file_target)

              log.debug('IDX: %s', index_name)
              log.debug('SCHEMA: %s', schema_defn)
              return search_type, index_name, schema_defn

          def _init_searcher(self, index_name):
              """
              Open ``index_name`` and keep a searcher for it on the instance.

              A searcher stored by an earlier call is replaced without being
              closed here.

              :param index_name: name of the index inside the configured location
              :return: the new searcher, also available as ``self.searcher``
              """
              location = self.config['location']
              searcher = open_dir(location, indexname=index_name).searcher()
              self.searcher = searcher
              return searcher

      # Generic alias for this backend's searcher class.
      # NOTE(review): presumably lets callers pick up the backend under a fixed
      # name - confirm against the code that loads search backends.
      Searcher = WhooshSearcher

      class WhooshResultWrapper(object):
          """
          Lazy view over whoosh results that turns every hit into a plain dict.

          :param search_type: searched field, one of ``content``, ``path`` or
              ``message``; decides which extra keys are added to every hit
          :param total_hits: number reported by ``len()``
          :param results: iterable and indexable collection of hits
          """

          def __init__(self, search_type, total_hits, results):
              self.search_type = search_type
              self.results = results
              self.total_hits = total_hits

          def __str__(self):
              return '<%s at %s>' % (self.__class__.__name__, len(self))

          def __repr__(self):
              return self.__str__()

          def __len__(self):
              return self.total_hits

          def __iter__(self):
              """
              Iterate over all hits, building the full content of each lazily.
              """
              for hit in self.results:
                  yield self.get_full_content(hit)

          def __getitem__(self, key):
              """
              Return one hit for an index, or a lazy iterable for a slice.

              A slice yields the full content of every selected hit on demand;
              any other key is handed to the underlying results and the full
              content of that single hit is returned.
              """
              if isinstance(key, slice):
                  # hand the slice over unchanged so that a step is honoured too
                  return (self.get_full_content(hit)
                          for hit in self.results[key])
              return self.get_full_content(self.results[key])

          def get_full_content(self, hit):
              """
              Convert ``hit`` into a dict enriched for the current search type.

              :param hit: mapping-like hit; ``content`` and ``message`` searches
                  also call its ``highlights`` method
              :return: dict copy of the hit with ``content_highlight`` set to
                  ``''`` plus ``f_path``, ``content_short_hl`` or ``message_hl``
                  depending on the search type
              """
              # TODO: marcink: this feels like an overkill, there's a lot of data
              # inside hit object, and we don't need all
              res = dict(hit)
              # elastic search uses that, we set it empty so it fallbacks to
              # regular HL logic
              res['content_highlight'] = ''

              f_path = ''  # pragma: no cover
              if self.search_type in ['content', 'path']:
                  # cut as many leading characters as the repository name has
                  # NOTE(review): assumes ``path`` starts with the repository
                  # name - confirm against the indexer
                  f_path = res['path'][len(res['repository']):]
                  f_path = f_path.lstrip(os.sep)

              if self.search_type == 'content':
                  res.update({'content_short_hl': hit.highlights('content'),
                              'f_path': f_path})
              elif self.search_type == 'path':
                  res.update({'f_path': f_path})
              elif self.search_type == 'message':
                  res.update({'message_hl': hit.highlights('message')})

              return res

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# -*- coding: utf-8 -*-

				# Copyright (C) 2012-2019 RhodeCode GmbH
				#
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU Affero General Public License, version 3
				# (only), as published by the Free Software Foundation.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU Affero General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				#
				# This program is dual-licensed. If you wish to learn more about the
				# RhodeCode Enterprise Edition, including its added features, Support services,
				# and proprietary license terms, please see https://rhodecode.com/licenses/

				"""
				Index schema for RhodeCode
				"""

				from __future__ import absolute_import
				import os
				import re
				import logging

				from whoosh import query as query_lib
				from whoosh.highlight import HtmlFormatter, ContextFragmenter
				from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
				from whoosh.qparser import QueryParser, QueryParserError

				import rhodecode.lib.helpers as h
				from rhodecode.lib.index import BaseSearcher
				from rhodecode.lib.utils2 import safe_unicode

				log = logging.getLogger(__name__)


				try:
				# we first try to import from rhodecode tools, fallback to copies if
				# we're unable to
				from rhodecode_tools.lib.fts_index.whoosh_schema import (
				ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
				COMMIT_SCHEMA)
				except ImportError:
				log.warning('rhodecode_tools schema not available, doing a fallback '
				'import from `rhodecode.lib.index.whoosh_fallback_schema`')
				from rhodecode.lib.index.whoosh_fallback_schema import (
				ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
				COMMIT_SCHEMA)


				# Highlight defaults: the formatter is built with the ``span`` tag name and
				# a visible "..." break marker that is placed between separate fragments.
				FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

				# Fragmenter built with 200 as its first (size) argument.
				FRAGMENTER = ContextFragmenter(200)


				class WhooshSearcher(BaseSearcher):
				# this also shows in UI
				query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
				name = 'whoosh'

				def __init__(self, config):
				    """
				    Open the file and commit indexes, creating whichever is missing.

				    :param config: mapping whose ``location`` entry is the directory
				        that holds the whoosh indexes; the directory is created when
				        it does not exist yet.
				    """
				    # name the class explicitly: the module level ``Searcher`` alias is
				    # only bound after the class body and could be re-bound elsewhere
				    super(WhooshSearcher, self).__init__()
				    self.config = config
				    location = self.config['location']

				    if not os.path.isdir(location):
				        try:
				            os.makedirs(location)
				        except OSError:
				            # the directory may have been created by someone else
				            # between the check and the call; fail only if it is
				            # still missing
				            if not os.path.isdir(location):
				                raise

				    def open_or_create(index_name, schema):
				        # re-use an index that is already on disk, otherwise create it
				        if exists_in(location, indexname=index_name):
				            opener = open_dir
				        else:
				            opener = create_in
				        return opener(location, schema=schema, indexname=index_name)

				    self.file_schema = FILE_SCHEMA
				    self.file_index = open_or_create(FILE_INDEX_NAME, FILE_SCHEMA)
				    self.commit_schema = COMMIT_SCHEMA
				    self.commit_index = open_or_create(COMMIT_INDEX_NAME, COMMIT_SCHEMA)

				    # set by ``_init_searcher`` and released by ``cleanup``
				    self.searcher = None

				def cleanup(self):
				    """
				    Close the searcher opened by the last search, if there is one.

				    The reference is cleared before closing, so calling this method
				    again is a no-op instead of closing the same searcher twice.
				    """
				    searcher, self.searcher = self.searcher, None
				    if searcher is not None:
				        searcher.close()

				def _extend_query(self, query):
				hashes = re.compile('([0-9a-f]{5,40})').findall(query)
				if hashes:
				hashes_or_query = ' OR '.join('commit_id:%s*' % h for h in hashes)
				query = u'(%s) OR %s' % (query, hashes_or_query)
				return query

				def sort_def(self, search_type, direction, sort_field):
				    """
				    Translate a requested sort field into the index field to sort by.

				    :param search_type: ``commit`` or ``message`` (commit index),
				        ``path`` or ``content``
				    :param direction: not used here, kept for interface compatibility
				    :param sort_field: sort key as requested by the caller
				    :return: name of the index field; ``None`` when ``sort_field`` is
				        not sortable for a known search type; ``''`` when the search
				        type itself is unknown
				    """
				    # ``search()`` passes the value produced by ``_prepare_for_search``,
				    # which is ``message`` for commit searches, so both spellings have
				    # to select the commit fields
				    if search_type in ('commit', 'message'):
				        field_defs = {
				            'message': 'message',
				            'date': 'date',
				            'author_email': 'author',
				        }
				    elif search_type == 'path':
				        field_defs = {
				            'file': 'path',
				            'size': 'size',
				            'lines': 'lines',
				        }
				    elif search_type == 'content':
				        # NOTE(dan): content doesn't support any sorting
				        field_defs = {}
				    else:
				        return ''

				    return field_defs.get(sort_field)

				def search(self, query, document_type, search_user,
				           repo_name=None, repo_group_name=None,
				           requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
				    """
				    Run ``query`` against the index selected by ``document_type``.

				    :param query: query string
				    :param document_type: ``content``, ``commit`` or ``path``; other
				        values fall back to the file index (see ``_prepare_for_search``)
				    :param search_user: user whose permissions restrict the results
				    :param repo_name: optional, restrict results to this repository
				    :param repo_group_name: accepted but not used by this method
				    :param requested_page: accepted but not used by this method
				    :param page_limit: accepted but not used; ``limit=None`` is passed
				        to whoosh and slicing is left to the returned wrapper
				    :param sort: sort specification handed to ``self.get_sort``
				    :param raise_on_exc: accepted but not used; failures are always
				        reported through the ``error`` key
				    :return: dict with the keys ``results``, ``count``, ``error`` and
				        ``runtime``
				    """
				    query = self._extend_query(query)

				    log.debug(u'QUERY: %s on %s', query, document_type)
				    result = {
				        'results': [],
				        'count': 0,
				        'error': None,
				        'runtime': 0
				    }
				    search_type, index_name, schema_defn = self._prepare_for_search(
				        document_type)
				    try:
				        # opening the index has to happen inside the ``try``: it is the
				        # call the "no index" handler below is meant for
				        self._init_searcher(index_name)

				        qp = QueryParser(search_type, schema=schema_defn)
				        allowed_repos_filter = self._get_repo_filter(
				            search_user, repo_name)
				        try:
				            query = qp.parse(safe_unicode(query))
				            log.debug('query: %s (%s)', query, repr(query))

				            reverse, sorted_by = False, None
				            direction, sort_field = self.get_sort(search_type, sort)
				            if sort_field:
				                sort_definition = self.sort_def(
				                    search_type, direction, sort_field)
				                if sort_definition:
				                    sorted_by = sort_definition
				                    # anything but an explicit DESC sorts ascending
				                    reverse = direction == self.DIRECTION_DESC

				            whoosh_results = self.searcher.search(
				                query, filter=allowed_repos_filter, limit=None,
				                sortedby=sorted_by, reverse=reverse)

				            # fixes for 32k limit that whoosh uses for highlight
				            whoosh_results.fragmenter.charlimit = None
				            res_ln = whoosh_results.scored_length()
				            result['runtime'] = whoosh_results.runtime
				            result['count'] = res_ln
				            # the wrapper reads hits lazily, so the searcher stays
				            # open until ``cleanup`` is called
				            result['results'] = WhooshResultWrapper(
				                search_type, res_ln, whoosh_results)

				        except QueryParserError:
				            result['error'] = 'Invalid search query. Try quoting it.'
				    except (EmptyIndexError, IOError, OSError):
				        msg = 'There is no index to search in. Please run whoosh indexer'
				        log.exception(msg)
				        result['error'] = msg
				    except Exception:
				        msg = 'An error occurred during this search operation'
				        log.exception(msg)
				        result['error'] = msg

				    return result

				def statistics(self, translator):
				    """
				    Describe the file and commit indexes as a list of display rows.

				    :param translator: callable applied to every row label
				    :return: list of ``{'key': ..., 'value': ...}`` dicts, with
				        ``{'sep': True}`` entries between the groups
				    """
				    _ = translator
				    rows = [
				        {'key': _('Index Type'), 'value': 'Whoosh'},
				        {'sep': True},
				    ]

				    # file index; the document count is passed on as returned
				    file_idx = self.file_index
				    rows.append({'key': _('File Index'), 'value': str(file_idx)})
				    rows.append({'key': _('Indexed documents'),
				                 'value': file_idx.doc_count()})
				    rows.append({'key': _('Last update'),
				                 'value': h.time_to_datetime(file_idx.last_modified())})

				    rows.append({'sep': True})

				    # commit index; here the document count is converted to a string
				    commit_idx = self.commit_index
				    rows.append({'key': _('Commit index'), 'value': str(commit_idx)})
				    rows.append({'key': _('Indexed documents'),
				                 'value': str(commit_idx.doc_count())})
				    rows.append({'key': _('Last update'),
				                 'value': h.time_to_datetime(commit_idx.last_modified())})
				    return rows

				def _get_repo_filter(self, auth_user, repo_name):

				allowed_to_search = [
				repo for repo, perm in
				auth_user.permissions['repositories'].items()
				if perm != 'repository.none']

				if repo_name:
				repo_filter = [query_lib.Term('repository', repo_name)]

				elif 'hg.admin' in auth_user.permissions.get('global', []):
				return None

				else:
				repo_filter = [query_lib.Term('repository', _rn)
				for _rn in allowed_to_search]
				# in case we're not allowed to search anywhere, it's a trick
				# to tell whoosh we're filtering, on ALL results
				repo_filter = repo_filter or [query_lib.Term('repository', '')]

				return query_lib.Or(repo_filter)

				def _prepare_for_search(self, cur_type):
				    """
				    Resolve a document type to search field, index name and schema.

				    Unknown document types fall back to the ``content`` field, the
				    file index and the file schema.

				    :param cur_type: requested document type
				    :return: tuple ``(search_type, index_name, schema_defn)``
				    """
				    default_fields = {
				        'content': 'content',
				        'commit': 'message',
				        'path': 'path',
				        'repository': 'repository'
				    }
				    search_type = default_fields.get(cur_type, 'content')

				    # index name and schema always go together, so keep them paired
				    file_target = (FILE_INDEX_NAME, self.file_schema)
				    targets = {
				        'content': file_target,
				        'commit': (COMMIT_INDEX_NAME, self.commit_schema),
				        'path': file_target
				    }
				    index_name, schema_defn = targets.get(cur_type, file_target)

				    log.debug('IDX: %s', index_name)
				    log.debug('SCHEMA: %s', schema_defn)
				    return search_type, index_name, schema_defn

				def _init_searcher(self, index_name):
				    """
				    Open ``index_name`` and keep a searcher for it on the instance.

				    A searcher stored by an earlier call is replaced without being
				    closed here.

				    :param index_name: name of the index inside the configured location
				    :return: the new searcher, also available as ``self.searcher``
				    """
				    location = self.config['location']
				    searcher = open_dir(location, indexname=index_name).searcher()
				    self.searcher = searcher
				    return searcher


				# Generic alias for this backend's searcher class.
				# NOTE(review): presumably lets callers pick up the backend under a fixed
				# name - confirm against the code that loads search backends.
				Searcher = WhooshSearcher


				class WhooshResultWrapper(object):
				    """
				    Lazy view over whoosh results that turns every hit into a plain dict.

				    :param search_type: searched field, one of ``content``, ``path`` or
				        ``message``; decides which extra keys are added to every hit
				    :param total_hits: number reported by ``len()``
				    :param results: iterable and indexable collection of hits
				    """

				    def __init__(self, search_type, total_hits, results):
				        self.search_type = search_type
				        self.results = results
				        self.total_hits = total_hits

				    def __str__(self):
				        return '<%s at %s>' % (self.__class__.__name__, len(self))

				    def __repr__(self):
				        return self.__str__()

				    def __len__(self):
				        return self.total_hits

				    def __iter__(self):
				        """
				        Iterate over all hits, building the full content of each lazily.
				        """
				        for hit in self.results:
				            yield self.get_full_content(hit)

				    def __getitem__(self, key):
				        """
				        Return one hit for an index, or a lazy iterable for a slice.

				        A slice yields the full content of every selected hit on demand;
				        any other key is handed to the underlying results and the full
				        content of that single hit is returned.
				        """
				        if isinstance(key, slice):
				            # hand the slice over unchanged so that a step is honoured too
				            return (self.get_full_content(hit)
				                    for hit in self.results[key])
				        return self.get_full_content(self.results[key])

				    def get_full_content(self, hit):
				        """
				        Convert ``hit`` into a dict enriched for the current search type.

				        :param hit: mapping-like hit; ``content`` and ``message`` searches
				            also call its ``highlights`` method
				        :return: dict copy of the hit with ``content_highlight`` set to
				            ``''`` plus ``f_path``, ``content_short_hl`` or ``message_hl``
				            depending on the search type
				        """
				        # TODO: marcink: this feels like an overkill, there's a lot of data
				        # inside hit object, and we don't need all
				        res = dict(hit)
				        # elastic search uses that, we set it empty so it fallbacks to
				        # regular HL logic
				        res['content_highlight'] = ''

				        f_path = ''  # pragma: no cover
				        if self.search_type in ['content', 'path']:
				            # cut as many leading characters as the repository name has
				            # NOTE(review): assumes ``path`` starts with the repository
				            # name - confirm against the indexer
				            f_path = res['path'][len(res['repository']):]
				            f_path = f_path.lstrip(os.sep)

				        if self.search_type == 'content':
				            res.update({'content_short_hl': hit.highlights('content'),
				                        'f_path': f_path})
				        elif self.search_type == 'path':
				            res.update({'f_path': f_path})
				        elif self.search_type == 'message':
				            res.update({'message_hl': hit.highlights('message')})

				        return res