upstream/kallithea Commit - r2388:a0ef98f2

Added a unique ID field (fileid) to the Whoosh SCHEMA, which fixes re-indexing of modified files

marcink -

r2388:a0ef98f2 beta

parent child

rhodecode/lib/indexers/__init__.py

0 +1 0

              # -*- coding: utf-8 -*-
              """
                  rhodecode.lib.indexers.__init__
                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                  Whoosh indexing module for RhodeCode
                  :created_on: Aug 17, 2010
                  :author: marcink
                  :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
                  :license: GPLv3, see COPYING for more details.
              """
              # This program is free software: you can redistribute it and/or modify
              # it under the terms of the GNU General Public License as published by
              # the Free Software Foundation, either version 3 of the License, or
              # (at your option) any later version.
              #
              # This program is distributed in the hope that it will be useful,
              # but WITHOUT ANY WARRANTY; without even the implied warranty of
              # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
              # GNU General Public License for more details.
              #
              # You should have received a copy of the GNU General Public License
              # along with this program.  If not, see <http://www.gnu.org/licenses/>.
              import os
              import sys
              import traceback
              import logging
              from os.path import dirname as dn, join as jn
              #to get the rhodecode import
              sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
              from string import strip
              from shutil import rmtree
              from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
              from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
              from whoosh.index import create_in, open_dir
              from whoosh.formats import Characters
              from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
              from webhelpers.html.builder import escape
              from sqlalchemy import engine_from_config
              from rhodecode.model import init_model
              from rhodecode.model.scm import ScmModel
              from rhodecode.model.repo import RepoModel
              from rhodecode.config.environment import load_environment
              from rhodecode.lib.utils2 import LazyProperty
              from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
                  load_rcextensions
              # Custom analyzer: split the text into ``\w+`` tokens, then lowercase them.
              # It is used for indexing the ``content`` field below and again when
              # highlighting results in WhooshResultWrapper.highlight().
              ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
              # Index schema definition.
              # ``fileid`` is declared as a unique ID field; the indexing daemon writes
              # the file path into it and deletes changed documents by this term.
              # ``modtime`` is a stored-only value that the daemon reads back to decide
              # whether a file changed since it was indexed.
              SCHEMA = Schema(
+                 fileid=ID(unique=True),
                  owner=TEXT(),
                  repository=TEXT(stored=True),
                  path=TEXT(stored=True),
                  content=FieldType(format=Characters(), analyzer=ANALYZER,
                                    scorable=True, stored=True),
                  modtime=STORED(),
                  extension=TEXT(stored=True)
              )
              # Default name of the Whoosh index (default ``indexname`` of the daemon)
              IDX_NAME = 'HG_INDEX'
              # Highlighting setup passed to whoosh's highlight() by WhooshResultWrapper:
              # matches are wrapped in <span> and fragments are joined with a "break" span
              FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
              FRAGMENTER = ContextFragmenter(200)
              class MakeIndex(BasePasterCommand):
                  """
                  Paster command that builds or updates the Whoosh full text index.
                  It takes exactly one argument, the configuration file, and is tuned
                  with the options registered in ``update_parser``.
                  """
                  max_args = 1
                  min_args = 1
                  usage = "CONFIG_FILE"
                  summary = "Creates index for full text search given configuration file"
                  group_name = "RhodeCode"
                  takes_config_file = -1
                  parser = Command.standard_parser(verbose=True)
                  def command(self):
                      """
                      Configure logging and the database from the ini file, then run
                      ``WhooshIndexingDaemon`` while holding ``make_index.lock``.
                      Exits with status 1 when the lock is already held. The lock is
                      released even when the indexing run raises.
                      """
                      # ``import logging`` alone does not guarantee that the
                      # ``logging.config`` submodule has been loaded
                      import logging.config
                      logging.config.fileConfig(self.path_to_ini_file)
                      from pylons import config
                      add_cache(config)
                      engine = engine_from_config(config, 'sqlalchemy.db1.')
                      init_model(engine)
                      def split_names(raw):
                          # comma separated option value -> list of stripped names,
                          # ``None`` when the option was not given
                          if not raw:
                              return None
                          return [name.strip() for name in raw.split(',')]
                      index_location = config['index_dir']
                      # RepoModel is only consulted when no location was passed in
                      repo_location = (self.options.repo_location
                                       or RepoModel().repos_path)
                      repo_list = split_names(self.options.repo_list)
                      repo_update_list = split_names(self.options.repo_update_list)
                      load_rcextensions(config['here'])
                      #======================================================================
                      # WHOOSH DAEMON
                      #======================================================================
                      from rhodecode.lib.pidlock import LockHeld, DaemonLock
                      from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
                      try:
                          lock = DaemonLock(file_=jn(dn(dn(index_location)),
                                                     'make_index.lock'))
                          try:
                              daemon = WhooshIndexingDaemon(
                                  index_location=index_location,
                                  repo_location=repo_location,
                                  repo_list=repo_list,
                                  repo_update_list=repo_update_list)
                              daemon.run(full_index=self.options.full_index)
                          finally:
                              # give the lock back even if indexing blew up, so the
                              # next run is not locked out
                              lock.release()
                      except LockHeld:
                          sys.exit(1)
                  def update_parser(self):
                      """
                      Register the command line options: ``--repo-location``,
                      ``--index-only``, ``--update-only`` and ``-f``.
                      """
                      add_option = self.parser.add_option
                      add_option('--repo-location',
                                 action='store',
                                 dest='repo_location',
                                 help="Specifies repositories location to index OPTIONAL",
                                 )
                      add_option('--index-only',
                                 action='store',
                                 dest='repo_list',
                                 help="Specifies a comma separated list of repositores "
                                       "to build index on. If not given all repositories "
                                       "are scanned for indexing. OPTIONAL",
                                 )
                      add_option('--update-only',
                                 action='store',
                                 dest='repo_update_list',
                                 help="Specifies a comma separated list of repositores "
                                       "to re-build index on. OPTIONAL",
                                 )
                      add_option('-f',
                                 action='store_true',
                                 dest='full_index',
                                 help="Specifies that index should be made full i.e"
                                       " destroy old and build from scratch",
                                 default=False)
              class WhooshResultWrapper(object):
                  """
                  Lazy, sliceable view over the documents matched by a Whoosh matcher.
                  Document numbers and highlight offsets are collected once from the
                  matcher (see ``doc_ids``); stored fields are only fetched for the
                  results that are actually requested.
                  """
                  def __init__(self, search_type, searcher, matcher, highlight_items,
                               repo_location):
                      """
                      :param search_type: kind of search; highlighted output is only
                          produced for ``'content'``
                      :param searcher: object providing ``stored_fields(docnum)``
                      :param matcher: matcher that is walked to collect the results
                      :param highlight_items: terms handed to the highlighter
                      :param repo_location: base directory of the repositories, used
                          to turn indexed paths into repository relative ones
                      """
                      self.search_type = search_type
                      self.searcher = searcher
                      self.matcher = matcher
                      self.highlight_items = highlight_items
                      # number of characters kept before and after every match
                      self.fragment_size = 200
                      self.repo_location = repo_location
                  @LazyProperty
                  def doc_ids(self):
                      """
                      List of ``[docnum, chunks]`` entries, one per matched document,
                      where ``chunks`` are the ``(start, end)`` offsets produced by
                      ``get_chunks``. Computing it walks the matcher to its end.
                      """
                      collected = []
                      matcher = self.matcher
                      while matcher.is_active():
                          docnum = matcher.id()
                          collected.append([docnum, list(self.get_chunks())])
                          matcher.next()
                      return collected
                  def __str__(self):
                      return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
                  def __repr__(self):
                      return self.__str__()
                  def __len__(self):
                      return len(self.doc_ids)
                  def __iter__(self):
                      """
                      Iterate over all results, generating the content of each one
                      lazily.
                      """
                      for docid in self.doc_ids:
                          yield self.get_full_content(docid)
                  def __getitem__(self, key):
                      """
                      Return the results selected by ``key``.

                      :param key: a slice (returns a list of result dicts) or an
                          integer index (returns a single result dict)
                      :raises IndexError: for an integer index out of range
                      """
                      if isinstance(key, slice):
                          return [self.get_full_content(docid)
                                  for docid in self.doc_ids[key]]
                      # plain index, previously failed with AttributeError
                      return self.get_full_content(self.doc_ids[key])
                  def get_full_content(self, docid):
                      """
                      Fetch the stored fields of one result and extend them with
                      ``content_short``, ``content_short_hl`` and ``f_path``.

                      :param docid: a ``[docnum, chunks]`` entry from ``doc_ids``
                      :returns: the stored fields dict, updated in place
                      """
                      res = self.searcher.stored_fields(docid[0])
                      full_repo_path = jn(self.repo_location, res['repository'])
                      # keep only the part of the path below the repository directory
                      f_path = res['path'].split(full_repo_path)[-1]
                      f_path = f_path.lstrip(os.sep)
                      content_short = self.get_short_content(res, docid[1])
                      res.update({'content_short': content_short,
                                  'content_short_hl': self.highlight(content_short),
                                  'f_path': f_path})
                      return res
                  def get_short_content(self, res, chunks):
                      """
                      Join the slices of ``res['content']`` described by ``chunks``.
                      """
                      content = res['content']
                      return ''.join([content[start:end] for start, end in chunks])
                  def get_chunks(self):
                      """
                      Yield ``(start, end)`` character offsets around every span of the
                      document the matcher currently points at. Each chunk extends
                      ``fragment_size`` characters to both sides of the match. A chunk
                      never starts before the end of the previous one, so close
                      occurrences are not highlighted twice.
                      """
                      last_end = 0
                      for span in self.matcher.spans():
                          start = span.startchar or 0
                          end = span.endchar or 0
                          chunk_start = max(0, start - self.fragment_size)
                          chunk_end = end + self.fragment_size
                          if chunk_start < last_end:
                              chunk_start = last_end
                          last_end = chunk_end
                          yield (chunk_start, chunk_end,)
                  def highlight(self, content, top=5):
                      """
                      Return ``content`` HTML-escaped with the search terms highlighted,
                      or an empty string when this is not a ``'content'`` search.

                      :param content: text to highlight
                      :param top: passed through to whoosh's ``highlight``
                      """
                      if self.search_type != 'content':
                          return ''
                      hl = highlight(
                          text=escape(content),
                          terms=self.highlight_items,
                          analyzer=ANALYZER,
                          fragmenter=FRAGMENTER,
                          formatter=FORMATTER,
                          top=top
                      )
                      return hl

rhodecode/lib/indexers/daemon.py

0 +9 -3

              # -*- coding: utf-8 -*-
              """
                  rhodecode.lib.indexers.daemon
                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                  A daemon will read from task table and run tasks
                  :created_on: Jan 26, 2010
                  :author: marcink
                  :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
                  :license: GPLv3, see COPYING for more details.
              """
              # This program is free software: you can redistribute it and/or modify
              # it under the terms of the GNU General Public License as published by
              # the Free Software Foundation, either version 3 of the License, or
              # (at your option) any later version.
              #
              # This program is distributed in the hope that it will be useful,
              # but WITHOUT ANY WARRANTY; without even the implied warranty of
              # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
              # GNU General Public License for more details.
              #
              # You should have received a copy of the GNU General Public License
              # along with this program.  If not, see <http://www.gnu.org/licenses/>.
              import os
              import sys
              import logging
              import traceback
              from shutil import rmtree
              from time import mktime
              from os.path import dirname as dn
              from os.path import join as jn
              #to get the rhodecode import
              project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
              sys.path.append(project_path)
              from rhodecode.config.conf import INDEX_EXTENSIONS
              from rhodecode.model.scm import ScmModel
              from rhodecode.lib.utils2 import safe_unicode
              from rhodecode.lib.indexers import SCHEMA, IDX_NAME
              from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
                  NodeDoesNotExistError
              from whoosh.index import create_in, open_dir
              log = logging.getLogger('whoosh_indexer')
              class WhooshIndexingDaemon(object):
                  """
                  Daemon for atomic indexing jobs
                  """
                  def __init__(self, indexname=IDX_NAME, index_location=None,
                               repo_location=None, sa=None, repo_list=None,
                               repo_update_list=None):
                      """
                      Scan ``repo_location`` for repositories and prepare the index
                      directory.

                      :param indexname: name of the index to work on
                      :param index_location: directory holding the index (required)
                      :param repo_location: directory with the repositories (required)
                      :param sa: passed through to ``ScmModel``
                      :param repo_list: if given, only repositories with these names
                          are kept
                      :param repo_update_list: if given, only repositories with these
                          names are kept and marked for re-checking on update
                      :raises Exception: when either location is missing
                      """
                      self.indexname = indexname
                      self.index_location = index_location
                      if not index_location:
                          raise Exception('You have to provide index location')
                      self.repo_location = repo_location
                      if not repo_location:
                          raise Exception('You have to provide repositories location')
                      self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
                      # narrow the scan result down to the explicitly requested repos
                      if repo_list:
                          self.filtered_repo_paths = dict(
                              (name, repo) for name, repo in self.repo_paths.items()
                              if name in repo_list)
                          self.repo_paths = self.filtered_repo_paths
                      # repos whose already indexed files get re-checked on update
                      self.filtered_repo_update_paths = {}
                      if repo_update_list:
                          self.filtered_repo_update_paths = dict(
                              (name, repo) for name, repo in self.repo_paths.items()
                              if name in repo_update_list)
                          self.repo_paths = self.filtered_repo_update_paths
                      # no index directory yet means the first run must be a full build
                      self.initial = False
                      index_dir_missing = not os.path.isdir(self.index_location)
                      if index_dir_missing:
                          os.makedirs(self.index_location)
                          log.info('Cannot run incremental index since it does not'
                                   ' yet exist running full build')
                          self.initial = True
                  def get_paths(self, repo):
                      """
                      Walk the ``tip`` changeset of ``repo`` from its root and return
                      the set of all file paths in it, each joined onto ``repo.path``.
                      A ``RepositoryError`` is logged at debug level and the paths
                      gathered so far (possibly none) are returned.
                      """
                      collected = set()
                      try:
                          tip = repo.get_changeset('tip')
                          for _topnode, _dirs, files in tip.walk('/'):
                              collected.update(jn(repo.path, f.path) for f in files)
                      except RepositoryError:
                          log.debug(traceback.format_exc())
                      return collected
                  def get_node(self, repo, path):
                      """
                      Return the node for ``path`` from the changeset given by
                      ``repo.get_changeset()``. The leading ``repo.path`` plus one
                      separator character is cut off ``path`` before the lookup.
                      """
                      relative_path = path[len(repo.path) + 1:]
                      changeset = repo.get_changeset()
                      return changeset.get_node(relative_path)
                  def get_node_mtime(self, node):
                      """
                      Return the date of the node's last changeset as a timestamp
                      (``time.mktime`` of its time tuple).
                      """
                      changed_on = node.last_changeset.date
                      return mktime(changed_on.timetuple())
                  def add_doc(self, writer, path, repo, repo_name):
                      """
                      Add a single document for ``path`` to ``writer``. The node data
                      is fetched here from the vcs backend instance ``repo``.

                      :param writer: index writer that receives the document
                      :param path: path of the file, prefixed with ``repo.path``
                      :param repo: vcs backend instance the node is read from
                      :param repo_name: value written to the ``repository`` field
                      :returns: tuple ``(indexed, indexed_w_content)``; ``indexed`` is 1
                          when only the file name was indexed, ``indexed_w_content``
                          is 1 when the content was indexed too. Both are 0 when the
                          content could not be obtained as unicode.
                      """
                      node = self.get_node(repo, path)
                      indexed = indexed_w_content = 0
                      # we just index the content of chosen files, and skip binary files
                      if node.extension in INDEX_EXTENSIONS and not node.is_binary:
                          u_content = node.content
                          if not isinstance(u_content, unicode):
                              # the document is still added, just with empty content
                              log.warning('  >> %s Could not get this content as unicode '
                                          'replacing with empty content' % path)
                              u_content = u''
                          else:
                              log.debug('    >> %s [WITH CONTENT]' % path)
                              indexed_w_content += 1
                      else:
                          log.debug('    >> %s' % path)
                          # just index the file name without its content
                          u_content = u''
                          indexed += 1
                      # the same unicode path feeds both the unique ``fileid`` term and
                      # the stored ``path`` field
+                     p = safe_unicode(path)
                      writer.add_document(
+                         fileid=p,
                          owner=unicode(repo.contact),
                          repository=safe_unicode(repo_name),
-                         path=safe_unicode(path),
+                         path=p,
                          content=u_content,
                          modtime=self.get_node_mtime(node),
                          extension=node.extension
                      )
                      return indexed, indexed_w_content
                  def build_index(self):
                      """
                      Build the index from scratch: remove any existing index
                      directory, create a fresh index and add a document for every
                      file returned by ``get_paths`` for each repository in
                      ``self.repo_paths``.
                      """
                      if os.path.exists(self.index_location):
                          log.debug('removing previous index')
                          rmtree(self.index_location)
                      if not os.path.exists(self.index_location):
                          os.mkdir(self.index_location)
                      # create the index under the configured name; update_index()
                      # opens ``self.indexname``, so both must agree when a custom
                      # name was passed to the constructor
                      idx = create_in(self.index_location, SCHEMA,
                                      indexname=self.indexname)
                      writer = idx.writer()
                      log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS)
                      for repo_name, repo in self.repo_paths.items():
                          log.debug('building index @ %s' % repo.path)
                          # per repository counters: name-only docs / docs with content
                          i_cnt = iwc_cnt = 0
                          for idx_path in self.get_paths(repo):
                              i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
                              i_cnt += i
                              iwc_cnt += iwc
                          log.debug('added %s files %s with content for repo %s' % (
                                       i_cnt + iwc_cnt, iwc_cnt, repo.path)
                          )
                      log.debug('>> COMMITING CHANGES <<')
                      writer.commit(merge=True)
                      log.debug('>>> FINISHED BUILDING INDEX <<<')
                  def update_index(self):
                      log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                                 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
                      idx = open_dir(self.index_location, indexname=self.indexname)
                      # The set of all paths in the index
                      indexed_paths = set()
                      # The set of all paths we need to re-index
                      to_index = set()
                      reader = idx.reader()
                      writer = idx.writer()
                      # Loop over the stored fields in the index
                      for fields in reader.all_stored_fields():
                          indexed_path = fields['path']
                          indexed_repo_path = fields['repository']
                          indexed_paths.add(indexed_path)
                          if not indexed_repo_path in self.filtered_repo_update_paths:
                              continue
                          repo = self.repo_paths[indexed_repo_path]
                          try:
                              node = self.get_node(repo, indexed_path)
                              # Check if this file was changed since it was indexed
                              indexed_time = fields['modtime']
                              mtime = self.get_node_mtime(node)
                              if mtime > indexed_time:
                                  # The file has changed, delete it and add it to the list of
                                  # files to reindex
-                                 log.debug('adding to reindex list %s' % indexed_path)
-                                 writer.delete_by_term('path', indexed_path)
+                                 log.debug('adding to reindex list %s mtime: %s vs %s' % (
+                                                 indexed_path, mtime, indexed_time)
+                                 )
+                                 writer.delete_by_term('fileid', indexed_path)
                                  to_index.add(indexed_path)
                          except (ChangesetError, NodeDoesNotExistError):
                              # This file was deleted since it was indexed
                              log.debug('removing from index %s' % indexed_path)
                              writer.delete_by_term('path', indexed_path)
                      # Loop over the files in the filesystem
                      # Assume we have a function that gathers the filenames of the
                      # documents to be indexed
                      ri_cnt = riwc_cnt = 0
                      for repo_name, repo in self.repo_paths.items():
                          for path in self.get_paths(repo):
                              path = safe_unicode(path)
                              if path in to_index or path not in indexed_paths:
                                  # This is either a file that's changed, or a new file
                                  # that wasn't indexed before. So index it!
                                  i, iwc = self.add_doc(writer, path, repo, repo_name)
                                  log.debug('re indexing %s' % path)
                                  ri_cnt += i
                                  riwc_cnt += iwc
                      log.debug('added %s files %s with content for repo %s' % (
                                   ri_cnt + riwc_cnt, riwc_cnt, repo.path)
                      )
                      log.debug('>> COMMITING CHANGES <<')
                      writer.commit(merge=True)
                      log.debug('>>> FINISHED REBUILDING INDEX <<<')
                  def run(self, full_index=False):
                      """
                      Run daemon: do a full build when ``full_index`` is set or no
                      index existed at construction time, an incremental update
                      otherwise.
                      """
                      needs_full_build = full_index or self.initial
                      job = self.build_index if needs_full_build else self.update_index
                      job()

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages