daemon.py
#!/usr/bin/env python
# encoding: utf-8
# whoosh indexer daemon for hg-app
# Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License or (at your option) any later version of the license.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
"""
Created on Jan 26, 2010
@author: marcink
A deamon will read from task table and run tasks
"""
import sys
import os
from os.path import dirname as dn
from os.path import join as jn
#to get the pylons_app import
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)
from pidlock import LockHeld, DaemonLock
import traceback
from pylons_app.config.environment import load_environment
from pylons_app.model.hg_model import HgModel
from whoosh.index import create_in, open_dir
from shutil import rmtree
from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \
    SCHEMA, IDX_NAME
import logging
import logging.config
logging.config.fileConfig(jn(project_path, 'development.ini'))
log = logging.getLogger('whooshIndexer')


def scan_paths(root_location):
    """Scan root_location for repositories via HgModel.repo_scan; the result
    is a dict-like mapping iterated with .values() below, one repository
    object per repo."""
    return HgModel.repo_scan('/', root_location, None, True)


class WhooshIndexingDaemon(object):
    """Daemon for atomic jobs"""

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Recursively walk root_dir and return a set of all paths in it,
        excluding files under the .hg dir"""
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            if path.find('.hg') == -1:
                for f in files:
                    index_paths_.add(jn(path, f))
        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add a document for path to the given index writer"""
        #we don't want to read excluded file extensions, just index their paths
        if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
            fobj = open(path, 'rb')
            content = fobj.read()
            fobj.close()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                #in case we have a decode error, just represent it as a byte string
                u_content = unicode(str(content).encode('string_escape'))
        else:
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=os.path.getmtime(path))

    def build_index(self):
        if os.path.exists(IDX_LOCATION):
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo.path):
                log.debug(' >> %s' % idx_path)
                self.add_doc(writer, idx_path, repo)

        writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it
                # was indexed
                indexed_time = fields['modtime']
                mtime = os.path.getmtime(indexed_path)

                if mtime > indexed_time:
                    # The file has changed, delete it and add it to the list of
                    # files to reindex
                    log.debug('adding to reindex list %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)
        #writer.commit()

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s' % path)

        writer.commit(merge=True)
        #idx.optimize()
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run daemon"""
        if full_index:
            self.build_index()
        else:
            self.update_index()


if __name__ == "__main__":
    repo_location = '/home/marcink/python_workspace_dirty/*'

    try:
        l = DaemonLock()
        WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True)
        l.release()
    except LockHeld:
        sys.exit(1)
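
# Example invocation (a sketch; the paths and scheduling below are
# hypothetical, not part of this project): the script is meant to be run as a
# whole, e.g. from an init script or a cron job, and exits with status 1 if
# another instance already holds the DaemonLock. As written, __main__ always
# performs a full rebuild; calling run(full_index=False) instead would do the
# incremental update_index() pass.
#
#   $ python daemon.py                         # one-off full index build
#   */10 * * * * python /path/to/daemon.py     # hypothetical cron entry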