upstream/kallithea Files · rhodecode/lib/indexers/daemon.py

reset charset for git rpc cals also

marcink - - Load All Authors

File last commit:

r2569:b98fd6fc beta


                r2581:ee980ead

beta

Download file

             daemon.py
        
                    267 lines
            
             | 9.6 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / indexers / daemon.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        marcink
    
fixes to #92, updated changelog

              r885
            
      # -*- coding: utf-8 -*-

      """

          rhodecode.lib.indexers.daemon

          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        marcink
    
fixed daemon typos

              r1377
            
          A daemon will read from task table and run tasks

        marcink
    
Fixed whoosh daemon, for depracated walk method

              r947
            
        marcink
    
fixes to #92, updated changelog

              r885
            
          :created_on: Jan 26, 2010

          :author: marcink

        marcink
    
2012 copyrights

              r1824
            
          :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>

        marcink
    
fixes to #92, updated changelog

              r885
            
          :license: GPLv3, see COPYING for more details.

      """

        marcink
    
fixed license  issue #149

              r1206
            
      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU General Public License as published by

      # the Free Software Foundation, either version 3 of the License, or

      # (at your option) any later version.

        marcink
    
Fixed whoosh daemon, for depracated walk method

              r947
            
      #

        marcink
    
renamed project to rhodecode

              r547
            
      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

        marcink
    
Fixed whoosh daemon, for depracated walk method

              r947
            
      #

        marcink
    
renamed project to rhodecode

              r547
            
      # You should have received a copy of the GNU General Public License

        marcink
    
fixed license  issue #149

              r1206
            
      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

        marcink
    
renamed project to rhodecode

              r547
            
        marcink
    
simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function

              r1154
            
      import os

        marcink
    
renamed project to rhodecode

              r547
            
      import sys

        marcink
    
simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function

              r1154
            
      import logging

        marcink
    
fixes to #92, updated changelog

              r885
            
      import traceback

        marcink
    
simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function

              r1154
            
      from shutil import rmtree

      from time import mktime

        marcink
    
renamed project to rhodecode

              r547
            
      from os.path import dirname as dn

      from os.path import join as jn

      # put the project directory (four levels up) on sys.path so that
      # the rhodecode package can be imported

      project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

      sys.path.append(project_path)

        marcink
    
utils/conf...

              r2109
            
      from rhodecode.config.conf import INDEX_EXTENSIONS

        marcink
    
Refactor codes for scm model...

              r691
            
      from rhodecode.model.scm import ScmModel

        marcink
    
utils/conf...

              r2109
            
      from rhodecode.lib.utils2 import safe_unicode

      from rhodecode.lib.indexers import SCHEMA, IDX_NAME

        marcink
    
renamed project to rhodecode

              r547
            
        marcink
    
Added VCS into rhodecode core for faster and easier deployments of new versions

              r2007
            
      from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

        marcink
    
fixes issue with whoosh reindexing files that were removed or renamed

              r1711
            
          NodeDoesNotExistError

        marcink
    
rewrote whoosh indexing to run internal repository.walk() instead of filesystem....

              r560
            
        marcink
    
simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function

              r1154
            
      from whoosh.index import create_in, open_dir

        marcink
    
code garden

              r2101
            
      log = logging.getLogger('whoosh_indexer')

        marcink
    
renamed project to rhodecode

              r547
            
        marcink
    
bumbed whoosh to 2.3.X series...

              r1995
            
        marcink
    
renamed project to rhodecode

              r547
            
      class WhooshIndexingDaemon(object):

        marcink
    
rewrote whoosh indexing to run internal repository.walk() instead of filesystem....

              r560
            
          """

        marcink
    
#469 added --update-only option to whoosh to re-index only given list...

              r2373
            
          Daemon for atomic indexing jobs

        marcink
    
rewrote whoosh indexing to run internal repository.walk() instead of filesystem....

              r560
            
          """

        marcink
    
renamed project to rhodecode

              r547
            
        marcink
    
bumbed whoosh to 2.3.X series...

              r1995
            
          def __init__(self, indexname=IDX_NAME, index_location=None,
                       repo_location=None, sa=None, repo_list=None,
                       repo_update_list=None):
              """
              Validate the locations, scan for repositories and make sure the
              index directory exists.

              :param indexname: name of the index, stored on the instance
              :param index_location: directory holding the index; required
              :param repo_location: location handed to
                  ``ScmModel.repo_scan``; required
              :param sa: passed to ``ScmModel`` as-is
              :param repo_list: optional repository names; when given, only
                  these are kept in ``repo_paths``
              :param repo_update_list: optional repository names; when given,
                  only these are kept in ``repo_paths`` and they are also
                  stored in ``filtered_repo_update_paths``
              :raises Exception: when either location is missing
              """
              self.indexname = indexname

              self.index_location = index_location
              if not index_location:
                  raise Exception('You have to provide index location')

              self.repo_location = repo_location
              if not repo_location:
                  raise Exception('You have to provide repositories location')

              self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

              # narrow the scan result down to the explicitly requested repos
              if repo_list:
                  self.filtered_repo_paths = dict(
                      (name, scm_repo)
                      for name, scm_repo in self.repo_paths.items()
                      if name in repo_list
                  )
                  self.repo_paths = self.filtered_repo_paths

              # stays empty unless an update list was given
              self.filtered_repo_update_paths = {}
              if repo_update_list:
                  self.filtered_repo_update_paths = dict(
                      (name, scm_repo)
                      for name, scm_repo in self.repo_paths.items()
                      if name in repo_update_list
                  )
                  self.repo_paths = self.filtered_repo_update_paths

              self.initial = False
              if os.path.isdir(self.index_location):
                  return

              # no index directory yet: create it and flag the first run
              os.makedirs(self.index_location)
              log.info('Cannot run incremental index since it does not'
                       ' yet exist running full build')
              self.initial = True

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
          def get_paths(self, repo):
              """
              Collect the paths of all files in the ``tip`` changeset of
              ``repo``.

              The tree is walked through the vcs backend (``tip.walk('/')``)
              and every file path is joined onto ``repo.path``.

              :param repo: vcs repository instance
              :returns: set of joined paths; if the backend raises
                  ``RepositoryError`` the paths gathered so far (possibly
                  none) are returned instead of propagating the error
              """
              index_paths_ = set()
              try:
                  tip = repo.get_changeset('tip')
                  for _topnode, _dirs, files in tip.walk('/'):
                      index_paths_.update(jn(repo.path, f.path) for f in files)
              except RepositoryError:
                  # best effort: the error is only logged at debug level
                  # (presumably hit for a freshly created repository -
                  # confirm against the vcs backend)
                  log.debug(traceback.format_exc())
              return index_paths_

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
          def get_node(self, repo, path):
              """
              Return the node for ``path`` from the changeset given by
              ``repo.get_changeset()``.

              The first ``len(repo.path) + 1`` characters of ``path`` (the
              repository path and one separator, as joined in ``get_paths``)
              are cut off before the lookup.
              """
              relative_path = path[len(repo.path) + 1:]
              return repo.get_changeset().get_node(relative_path)

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
          def get_node_mtime(self, node):
              """
              Return the date of ``node.last_changeset`` converted with
              ``time.mktime``.
              """
              changed_on = node.last_changeset.date
              return mktime(changed_on.timetuple())

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
changes for #56

              r1171
            
          def add_doc(self, writer, path, repo, repo_name):
              """
              Add a single file to ``writer``; the data itself is fetched
              from the vcs backend instance.

              Content is stored only for non-binary files whose extension is
              in ``INDEX_EXTENSIONS`` and whose content is unicode. Every
              other file is added with empty content.

              :param writer: index writer; ``add_document`` is called on it
              :param path: path of the file, resolved with ``get_node``
              :param repo: vcs repository instance the file belongs to
              :param repo_name: value stored in the ``repository`` field
              :returns: tuple ``(indexed, indexed_w_content)``; exactly one
                  of the two is 1, depending on whether content was stored
              """
              node = self.get_node(repo, path)
              indexed = indexed_w_content = 0

              # we just index the content of chosen files, and skip binary files
              if node.extension in INDEX_EXTENSIONS and not node.is_binary:
                  u_content = node.content
                  if not isinstance(u_content, unicode):
                      log.warning('  >> %s Could not get this content as unicode '
                                  'replacing with empty content', path)
                      u_content = u''
                      # the document is still added below, so it has to be
                      # counted, otherwise the callers' totals undercount
                      indexed += 1
                  else:
                      log.debug('    >> %s [WITH CONTENT]', path)
                      indexed_w_content += 1
              else:
                  log.debug('    >> %s', path)
                  # just index the file name without its content
                  u_content = u''
                  indexed += 1

              p = safe_unicode(path)
              writer.add_document(
                  fileid=p,
                  owner=unicode(repo.contact),
                  repository=safe_unicode(repo_name),
                  path=p,
                  content=u_content,
                  modtime=self.get_node_mtime(node),
                  extension=node.extension
              )
              return indexed, indexed_w_content

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
          def build_index(self):
              """
              Build the whole index from scratch.

              An existing index directory is removed and re-created, a new
              index is created in it, every path returned by ``get_paths``
              for each repository in ``self.repo_paths`` is added through
              ``add_doc``, and finally the writer is committed.
              """
              if os.path.exists(self.index_location):
                  log.debug('removing previous index')
                  rmtree(self.index_location)

              if not os.path.exists(self.index_location):
                  os.mkdir(self.index_location)

              # create the index under the configured name: update_index()
              # opens it with self.indexname, so a hard-coded IDX_NAME here
              # breaks any daemon constructed with a custom indexname
              idx = create_in(self.index_location, SCHEMA,
                              indexname=self.indexname)
              writer = idx.writer()
              log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                        'AND REPOS %s', INDEX_EXTENSIONS, self.repo_paths.keys())

              for repo_name, repo in self.repo_paths.items():
                  log.debug('building index @ %s', repo.path)
                  i_cnt = iwc_cnt = 0
                  for idx_path in self.get_paths(repo):
                      i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
                      i_cnt += i
                      iwc_cnt += iwc
                  log.debug('added %s files %s with content for repo %s',
                            i_cnt + iwc_cnt, iwc_cnt, repo.path)

              log.debug('>> COMMITING CHANGES <<')
              writer.commit(merge=True)
              log.debug('>>> FINISHED BUILDING INDEX <<<')

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
          def update_index(self):

        marcink
    
#469 added --update-only option to whoosh to re-index only given list...

              r2373
            
              log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '

                         'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
              idx = open_dir(self.index_location, indexname=self.indexname)

        marcink
    
renamed project to rhodecode

              r547
            
              # The set of all paths in the index

              indexed_paths = set()

              # The set of all paths we need to re-index

              to_index = set()

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
              reader = idx.reader()

              writer = idx.writer()

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
              # Loop over the stored fields in the index

              for fields in reader.all_stored_fields():

                  indexed_path = fields['path']

        marcink
    
#469 added --update-only option to whoosh to re-index only given list...

              r2373
            
                  indexed_repo_path = fields['repository']

        marcink
    
renamed project to rhodecode

              r547
            
                  indexed_paths.add(indexed_path)

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
#469 added --update-only option to whoosh to re-index only given list...

              r2373
            
                  if not indexed_repo_path in self.filtered_repo_update_paths:

                      continue

                  repo = self.repo_paths[indexed_repo_path]

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
                  try:

                      node = self.get_node(repo, indexed_path)

                      # Check if this file was changed since it was indexed

        marcink
    
renamed project to rhodecode

              r547
            
                      indexed_time = fields['modtime']

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
                      mtime = self.get_node_mtime(node)

        marcink
    
renamed project to rhodecode

              r547
            
                      if mtime > indexed_time:

                          # The file has changed, delete it and add it to the list of

                          # files to reindex

        marcink
    
#453 added ID field in whoosh SCHEMA that solves the issue of reindexing modified files

              r2388
            
                          log.debug('adding to reindex list %s mtime: %s vs %s' % (

                                          indexed_path, mtime, indexed_time)

                          )

                          writer.delete_by_term('fileid', indexed_path)

        marcink
    
renamed project to rhodecode

              r547
            
                          to_index.add(indexed_path)

        marcink
    
#469 added --update-only option to whoosh to re-index only given list...

              r2373
            
                  except (ChangesetError, NodeDoesNotExistError):

                      # This file was deleted since it was indexed

                      log.debug('removing from index %s' % indexed_path)

                      writer.delete_by_term('path', indexed_path)

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
              # Loop over the files in the filesystem

              # Assume we have a function that gathers the filenames of the

              # documents to be indexed

        marcink
    
Little better logging in whoosh indexer

              r2569
            
              ri_cnt_total = 0  # indexed

              riwc_cnt_total = 0  # indexed with content

        marcink
    
changes for #56

              r1171
            
              for repo_name, repo in self.repo_paths.items():

        marcink
    
Little better logging in whoosh indexer

              r2569
            
                  ri_cnt = 0   # indexed

                  riwc_cnt = 0  # indexed with content

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
                  for path in self.get_paths(repo):

        marcink
    
fixed issue with whoosh always re-indexing non-ascii filenames even if they didn't change

              r2372
            
                      path = safe_unicode(path)

        marcink
    
renamed project to rhodecode

              r547
            
                      if path in to_index or path not in indexed_paths:

        marcink
    
#453 added ID field in whoosh SCHEMA that solves the issue of reindexing modified files

              r2388
            
        marcink
    
renamed project to rhodecode

              r547
            
                          # This is either a file that's changed, or a new file

                          # that wasn't indexed before. So index it!

        marcink
    
utils/conf...

              r2109
            
                          i, iwc = self.add_doc(writer, path, repo, repo_name)

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
                          log.debug('re indexing %s' % path)

        marcink
    
utils/conf...

              r2109
            
                          ri_cnt += i

        marcink
    
Little better logging in whoosh indexer

              r2569
            
                          ri_cnt_total += 1

        marcink
    
utils/conf...

              r2109
            
                          riwc_cnt += iwc

        marcink
    
Little better logging in whoosh indexer

              r2569
            
                          riwc_cnt_total += iwc

                  log.debug('added %s files %s with content for repo %s' % (

                               ri_cnt + riwc_cnt, riwc_cnt, repo.path)

                  )

              log.debug('indexed %s files in total and %s with content' % (

                          ri_cnt_total, riwc_cnt_total)

        marcink
    
utils/conf...

              r2109
            
              )

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
              log.debug('>> COMMITING CHANGES <<')

        marcink
    
renamed project to rhodecode

              r547
            
              writer.commit(merge=True)

        marcink
    
fixed reindexing, and made some optimizations to reuse repo instances from repo scann list.

              r561
            
              log.debug('>>> FINISHED REBUILDING INDEX <<<')

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
          def run(self, full_index=False):
              """
              Run daemon

              Performs a full index build when ``full_index`` is true or when
              ``self.initial`` is set, otherwise an incremental update.

              :param full_index: force a complete rebuild of the index
              """
              rebuild = full_index or self.initial
              if not rebuild:
                  # an index is already present and no rebuild was requested
                  self.update_index()
                  return
              self.build_index()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

marcink fixes to #92, updated changelog	r885	# -- coding: utf-8 --
		"""
		rhodecode.lib.indexers.daemon
		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

marcink fixed daemon typos	r1377	A daemon will read from task table and run tasks
marcink Fixed whoosh daemon, for depracated walk method	r947
marcink fixes to #92, updated changelog	r885	:created_on: Jan 26, 2010
		:author: marcink
marcink 2012 copyrights	r1824	:copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
marcink fixes to #92, updated changelog	r885	:license: GPLv3, see COPYING for more details.
		"""
marcink fixed license issue #149	r1206	# This program is free software: you can redistribute it and/or modify
		# it under the terms of the GNU General Public License as published by
		# the Free Software Foundation, either version 3 of the License, or
		# (at your option) any later version.
marcink Fixed whoosh daemon, for depracated walk method	r947	#
marcink renamed project to rhodecode	r547	# This program is distributed in the hope that it will be useful,
		# but WITHOUT ANY WARRANTY; without even the implied warranty of
		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		# GNU General Public License for more details.
marcink Fixed whoosh daemon, for depracated walk method	r947	#
marcink renamed project to rhodecode	r547	# You should have received a copy of the GNU General Public License
marcink fixed license issue #149	r1206	# along with this program. If not, see <http://www.gnu.org/licenses/>.
marcink renamed project to rhodecode	r547
marcink simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function	r1154	import os
marcink renamed project to rhodecode	r547	import sys
marcink simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function	r1154	import logging
marcink fixes to #92, updated changelog	r885	import traceback
marcink simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function	r1154
		from shutil import rmtree
		from time import mktime

marcink renamed project to rhodecode	r547	from os.path import dirname as dn
		from os.path import join as jn

		#to get the rhodecode import
		project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
		sys.path.append(project_path)

marcink utils/conf...	r2109	from rhodecode.config.conf import INDEX_EXTENSIONS
marcink Refactor codes for scm model...	r691	from rhodecode.model.scm import ScmModel
marcink utils/conf...	r2109	from rhodecode.lib.utils2 import safe_unicode
		from rhodecode.lib.indexers import SCHEMA, IDX_NAME
marcink renamed project to rhodecode	r547
marcink Added VCS into rhodecode core for faster and easier deployments of new versions	r2007	from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
marcink fixes issue with whoosh reindexing files that were removed or renamed	r1711	NodeDoesNotExistError
marcink rewrote whoosh indexing to run internal repository.walk() instead of filesystem....	r560
marcink simplified str2bool, and moved safe_unicode out of helpers since it was not html specific function	r1154	from whoosh.index import create_in, open_dir

marcink code garden	r2101	log = logging.getLogger('whoosh_indexer')
marcink renamed project to rhodecode	r547
marcink bumbed whoosh to 2.3.X series...	r1995
class WhooshIndexingDaemon(object):
    """
    Daemon for atomic indexing jobs

    Walks the repositories found under ``repo_location`` and stores one
    whoosh document per file (path, owner, repository, modification time,
    extension and, for selected extensions, the file content) in the index
    kept at ``index_location``.
    """

    def __init__(self, indexname=IDX_NAME, index_location=None,
                 repo_location=None, sa=None, repo_list=None,
                 repo_update_list=None):
        """
        :param indexname: name of the whoosh index inside ``index_location``
        :param index_location: directory of the index, created when missing
        :param repo_location: path handed to ``ScmModel.repo_scan``
        :param sa: passed unchanged to ``ScmModel``
        :param repo_list: optional collection of repository names; when given
            only these repositories are kept in ``self.repo_paths``
        :param repo_update_list: optional collection of repository names;
            when given ``self.repo_paths`` is narrowed further to them
        :raises Exception: if ``index_location`` or ``repo_location`` is
            empty
        """
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        # filter repo list
        if repo_list:
            self.filtered_repo_paths = dict(
                (repo_name, repo)
                for repo_name, repo in self.repo_paths.items()
                if repo_name in repo_list
            )
            self.repo_paths = self.filtered_repo_paths

        # filter update repo list
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            self.filtered_repo_update_paths = dict(
                (repo_name, repo)
                for repo_name, repo in self.repo_paths.items()
                if repo_name in repo_update_list
            )
            self.repo_paths = self.filtered_repo_update_paths

        # a missing index directory means there is nothing to update
        # incrementally yet, so run() has to do a full build first
        self.initial = False
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not'
                     ' yet exist running full build')
            self.initial = True

    def get_paths(self, repo):
        """
        recursive walk in root dir and return a set of all path in that dir
        based on repository walk function

        :param repo: repository instance to walk at its ``tip`` changeset
        :returns: set of ``repo.path`` joined with every file path; the set
            is empty (or partial) when the walk raises ``RepositoryError``
        """
        index_paths_ = set()
        try:
            tip = repo.get_changeset('tip')
            for topnode, dirs, files in tip.walk('/'):
                for f in files:
                    index_paths_.add(jn(repo.path, f.path))

        except RepositoryError:
            # best effort: log the failure and return what was gathered
            log.debug(traceback.format_exc())
        return index_paths_

    def get_node(self, repo, path):
        """
        Return the node for ``path`` from the repository default changeset

        :param repo: repository instance
        :param path: path prefixed with ``repo.path`` plus one separator
            character, as produced by ``get_paths``
        """
        # strip the repository location prefix and the separator after it
        n_path = path[len(repo.path) + 1:]
        node = repo.get_changeset().get_node(n_path)
        return node

    def get_node_mtime(self, node):
        """
        Return the date of the node's last changeset as a ``mktime`` value
        """
        return mktime(node.last_changeset.date.timetuple())

    def add_doc(self, writer, path, repo, repo_name):
        """
        Adding doc to writer this function itself fetches data from
        the instance of vcs backend

        :param writer: whoosh index writer receiving the document
        :param path: full path of the file (repository path + file path)
        :param repo: repository instance the file belongs to
        :param repo_name: name stored in the ``repository`` field
        :returns: tuple ``(indexed, indexed_w_content)``; each is 0 or 1
        """

        node = self.get_node(repo, path)
        indexed = indexed_w_content = 0
        # we just index the content of chosen files, and skip binary files
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
            u_content = node.content
            if not isinstance(u_content, unicode):
                # NOTE(review): the document is still added below, but this
                # case increments neither counter - confirm this is intended
                log.warning('  >> %s Could not get this content as unicode '
                            'replacing with empty content' % path)
                u_content = u''
            else:
                log.debug('    >> %s [WITH CONTENT]' % path)
                indexed_w_content += 1

        else:
            log.debug('    >> %s' % path)
            # just index file name without it's content
            u_content = u''
            indexed += 1

        p = safe_unicode(path)
        writer.add_document(
            fileid=p,
            owner=unicode(repo.contact),
            repository=safe_unicode(repo_name),
            path=p,
            content=u_content,
            modtime=self.get_node_mtime(node),
            extension=node.extension
        )
        return indexed, indexed_w_content

    def build_index(self):
        """
        Remove any existing index directory and build the index from scratch
        for every repository in ``self.repo_paths``
        """
        if os.path.exists(self.index_location):
            log.debug('removing previous index')
            rmtree(self.index_location)

        if not os.path.exists(self.index_location):
            os.mkdir(self.index_location)

        # create the index under the configured name so that update_index,
        # which opens ``self.indexname``, finds it again
        idx = create_in(self.index_location, SCHEMA,
                        indexname=self.indexname)
        writer = idx.writer()
        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        for repo_name, repo in self.repo_paths.items():
            log.debug('building index @ %s' % repo.path)
            i_cnt = iwc_cnt = 0
            for idx_path in self.get_paths(repo):
                i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
                i_cnt += i
                iwc_cnt += iwc
            log.debug('added %s files %s with content for repo %s' % (
                         i_cnt + iwc_cnt, iwc_cnt, repo.path)
            )

        log.debug('>> COMMITING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """
        Incrementally update the existing index

        Stored documents whose file changed after it was indexed are deleted
        and queued for re-indexing, documents whose file no longer exists
        are deleted, and files that are not in the index yet are added.
        """
        log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                   'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        idx = open_dir(self.index_location, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        try:
            # Loop over the stored fields in the index
            for fields in reader.all_stored_fields():
                indexed_path = fields['path']
                indexed_repo_path = fields['repository']
                indexed_paths.add(indexed_path)

                # Only documents of repositories handled by this run can be
                # checked. ``self.repo_paths`` is the update list when one
                # was given and every scanned repository otherwise, so a
                # plain incremental run also notices changed/removed files.
                if indexed_repo_path not in self.repo_paths:
                    continue

                repo = self.repo_paths[indexed_repo_path]

                try:
                    node = self.get_node(repo, indexed_path)
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['modtime']
                    mtime = self.get_node_mtime(node)
                    if mtime > indexed_time:
                        # The file has changed, delete it and add it to the
                        # list of files to reindex
                        log.debug('adding to reindex list %s mtime: %s vs %s' % (
                                        indexed_path, mtime, indexed_time)
                        )
                        writer.delete_by_term('fileid', indexed_path)

                        to_index.add(indexed_path)
                except (ChangesetError, NodeDoesNotExistError):
                    # This file was deleted since it was indexed
                    log.debug('removing from index %s' % indexed_path)
                    # NOTE(review): changed files are deleted by 'fileid'
                    # but deleted ones by 'path' - confirm against SCHEMA
                    # that 'path' matches whole paths
                    writer.delete_by_term('path', indexed_path)
        finally:
            # the reader is not needed once the stored fields were scanned
            reader.close()

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        ri_cnt_total = 0  # indexed
        riwc_cnt_total = 0  # indexed with content
        for repo_name, repo in self.repo_paths.items():
            ri_cnt = 0   # indexed
            riwc_cnt = 0  # indexed with content
            for path in self.get_paths(repo):
                # stored paths are unicode, compare like with like
                path = safe_unicode(path)
                if path in to_index or path not in indexed_paths:

                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
                    log.debug('re indexing %s' % path)
                    ri_cnt += i
                    ri_cnt_total += 1
                    riwc_cnt += iwc
                    riwc_cnt_total += iwc
            log.debug('added %s files %s with content for repo %s' % (
                         ri_cnt + riwc_cnt, riwc_cnt, repo.path)
            )
        log.debug('indexed %s files in total and %s with content' % (
                    ri_cnt_total, riwc_cnt_total)
        )
        log.debug('>> COMMITING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED REBUILDING INDEX <<<')

    def run(self, full_index=False):
        """
        Run daemon

        :param full_index: force a full build instead of an incremental
            update; a full build is also done when ``self.initial`` is set
        """
        if full_index or self.initial:
            self.build_index()
        else:
            self.update_index()