upstream/kallithea Files · pylons_app/lib/indexers/daemon.py

added mimetypes to annotate

marcink - - Load All Authors

File last commit:

r491:fefffd6f celery


                r496:47f4c7ff

celery

Download file

             daemon.py
        
                    238 lines
            
             | 7.9 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / pylons_app / lib / indexers / daemon.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      #!/usr/bin/env python

      # encoding: utf-8

      # whoosh indexer daemon for hg-app

      # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>

      #

      # This program is free software; you can redistribute it and/or

      # modify it under the terms of the GNU General Public License

      # as published by the Free Software Foundation; version 2

      # of the License or (at your option) any later version of the license.

      # 

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      # 

      # You should have received a copy of the GNU General Public License

      # along with this program; if not, write to the Free Software

      # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

      # MA  02110-1301, USA.

      """

      Created on Jan 26, 2010

      @author: marcink

      A daemon will read from task table and run tasks

      """

      import sys

      import os

      from os.path import dirname as dn

      from os.path import join as jn

      #to get the pylons_app import

      project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

      sys.path.append(project_path)

      from pidlock import LockHeld, DaemonLock

      from pylons_app.model.hg_model import HgModel

      from pylons_app.lib.helpers import safe_unicode

      from whoosh.index import create_in, open_dir

      from shutil import rmtree

      from pylons_app.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME

      import logging

      # Dedicated logger for the indexer, used by every function in this module.
      log = logging.getLogger('whooshIndexer')
      # Emit everything down to DEBUG: the indexer logs one line per file.
      log.setLevel(logging.DEBUG)
      # Do not hand records to ancestor loggers; only the handler attached
      # below receives them.
      log.propagate = False
      # Console handler (StreamHandler without arguments writes to stderr),
      # also passing everything down to DEBUG.
      ch = logging.StreamHandler()
      ch.setLevel(logging.DEBUG)
      # Line layout: timestamp - logger name - level - message.
      formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
      # Attach the formatter to the console handler ...
      ch.setFormatter(formatter)
      # ... and the console handler to the indexer logger.
      log.addHandler(ch)

      def scan_paths(root_location):
          """Look up the repositories found under ``root_location``.

          Delegates to ``HgModel.repo_scan``.  Callers in this module invoke
          ``.values()`` on the result and read ``path``, ``name`` and
          ``contact`` from each value.
          """
          # NOTE(review): the meaning of the fixed '/', None and True
          # arguments is defined by HgModel.repo_scan - confirm there.
          scan_args = ('/', root_location, None, True)
          return HgModel.repo_scan(*scan_args)

      class WhooshIndexingDaemon(object):
          """Daemon that builds and incrementally updates the whoosh index.

          The index lives in ``IDX_LOCATION``.  ``run`` either rebuilds it from
          scratch (``build_index``) or refreshes it (``update_index``) from the
          repositories returned by ``scan_paths(self.repo_location)``.
          """

          def __init__(self, indexname='HG_INDEX', repo_location=None):
              """Store the settings and make sure the index directory exists.

              :param indexname: index name ``update_index`` opens.
              :param repo_location: location passed on to ``scan_paths``.
              """
              self.indexname = indexname
              self.repo_location = repo_location
              # When True, ``run`` does a full build even if an incremental
              # one was requested.
              self.initial = False
              if not os.path.isdir(IDX_LOCATION):
                  os.mkdir(IDX_LOCATION)
                  log.info('Cannot run incremental index since it does not'
                           ' yet exist running full build')
                  self.initial = True

          def get_paths(self, root_dir):
              """Return the set of all file paths below ``root_dir``.

              Directories named exactly ``.hg`` are skipped together with
              everything inside them.  Names that merely contain the text
              ``.hg`` (``.hgignore``, ``my.hgtools/``) are kept.
              """
              index_paths_ = set()
              for path, dirs, files in os.walk(root_dir):
                  # Prune in place so os.walk never descends into repository
                  # metadata.  A substring test on the whole path would also
                  # drop unrelated directories such as ``my.hgtools`` and
                  # every file below a root whose own path contains ``.hg``.
                  dirs[:] = [d for d in dirs if d != '.hg']
                  index_paths_.update(jn(path, f) for f in files)
              return index_paths_

          def add_doc(self, writer, path, repo):
              """Add one document for ``path`` to ``writer``.

              The file content is indexed only when the extension is listed in
              ``INDEX_EXTENSIONS``; otherwise the document carries an empty
              content field.  A path that no longer exists (or is a broken
              symlink) is logged and skipped.

              :param writer: index writer receiving ``add_document``.
              :param path: file system path of the file.
              :param repo: object providing ``contact`` and ``name``.
              :raises IOError, OSError: any file system error other than ENOENT.
              """
              import errno

              # For a name without a dot this is the whole lower-cased name.
              ext = unicode(path.split('/')[-1].split('.')[-1].lower())
              with_content = ext in INDEX_EXTENSIONS
              if with_content:
                  log.debug('    >> %s [WITH CONTENT]' % path)
              else:
                  log.debug('    >> %s' % path)

              try:
                  # Take the mtime before reading: if the file changes in
                  # between, the stored mtime is older than the file and the
                  # next incremental run picks the change up.
                  modtime = os.path.getmtime(path)
                  if with_content:
                      fobj = open(path, 'rb')
                      try:
                          content = fobj.read()
                      finally:
                          fobj.close()
                      u_content = safe_unicode(content)
                  else:
                      # just index the file name without its content
                      u_content = u''
              except (IOError, OSError):
                  # open() raises IOError on Python 2, so both types are
                  # caught.  sys.exc_info() keeps this valid on every Python
                  # version (``except X, e`` is Python-2-only syntax and
                  # ``except X as e`` needs 2.6+).
                  err = sys.exc_info()[1]
                  if err.errno == errno.ENOENT:
                      log.debug('path %s does not exist or is a broken symlink' % path)
                      return
                  # bare raise keeps the original traceback
                  raise

              writer.add_document(owner=unicode(repo.contact),
                                  repository=u"%s" % repo.name,
                                  path=u"%s" % path,
                                  content=u_content,
                                  modtime=modtime,
                                  extension=ext)

          def build_index(self):
              """Remove any existing index and build a new one from scratch."""
              if os.path.exists(IDX_LOCATION):
                  log.debug('removing previos index')
                  rmtree(IDX_LOCATION)

              if not os.path.exists(IDX_LOCATION):
                  os.mkdir(IDX_LOCATION)

              # NOTE(review): the index is created under IDX_NAME while
              # update_index opens self.indexname - the two must be equal for
              # incremental runs to find this index; confirm.
              idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
              writer = idx.writer()

              for repo in scan_paths(self.repo_location).values():
                  log.debug('building index @ %s' % repo.path)

                  for idx_path in self.get_paths(repo.path):
                      self.add_doc(writer, idx_path, repo)
              writer.commit(merge=True)

              log.debug('>>> FINISHED BUILDING INDEX <<<')

          def update_index(self):
              """Bring the existing index in line with the files on disk.

              Documents whose file no longer exists are deleted, documents
              whose file has an mtime newer than the stored ``modtime`` are
              deleted and added again, and files not yet indexed are added.
              """
              log.debug('STARTING INCREMENTAL INDEXING UPDATE')

              idx = open_dir(IDX_LOCATION, indexname=self.indexname)
              # The set of all paths in the index
              indexed_paths = set()
              # The set of all paths we need to re-index
              to_index = set()

              reader = idx.reader()
              writer = idx.writer()

              # Loop over the stored fields in the index
              for fields in reader.all_stored_fields():
                  indexed_path = fields['path']
                  indexed_paths.add(indexed_path)

                  if not os.path.exists(indexed_path):
                      # This file was deleted since it was indexed
                      log.debug('removing from index %s' % indexed_path)
                      writer.delete_by_term('path', indexed_path)
                  elif os.path.getmtime(indexed_path) > fields['modtime']:
                      # The file has changed: drop the stale document and
                      # remember the path so it is added again below.
                      log.debug('adding to reindex list %s' % indexed_path)
                      writer.delete_by_term('path', indexed_path)
                      to_index.add(indexed_path)

              # Loop over the files in the filesystem
              for repo in scan_paths(self.repo_location).values():
                  for path in self.get_paths(repo.path):
                      if path in to_index or path not in indexed_paths:
                          # This is either a file that's changed, or a new file
                          # that wasn't indexed before. So index it!
                          self.add_doc(writer, path, repo)
                          log.debug('reindexing %s' % path)

              writer.commit(merge=True)
              log.debug('>>> FINISHED <<<')

          def run(self, full_index=False):
              """Run the daemon once.

              :param full_index: when true, rebuild the whole index; a full
                  build is also done when no index directory existed at
                  construction time.
              """
              if full_index or self.initial:
                  self.build_index()
              else:
                  self.update_index()

      if __name__ == "__main__":
          # Command line entry point:
          #     daemon.py <full|incremental> <path to repositories>
          # Exit status: 2 for bad arguments, 1 when another instance holds
          # the daemon lock.
          arg = sys.argv[1:]
          if len(arg) != 2:
              sys.stderr.write('Please specify indexing type [full|incremental]'
                               ' and path to repositories as script args \n')
              sys.exit(2)

          if arg[0] == 'full':
              full_index = True
          elif arg[0] == 'incremental':
              # False means looking just for changes
              full_index = False
          else:
              sys.stderr.write('Please use [full|incremental]'
                               ' as script first arg \n')
              sys.exit(2)

          if not os.path.isdir(arg[1]):
              sys.stderr.write('%s is not a valid path \n' % arg[1])
              sys.exit(2)

          # scan_paths receives the repositories directory followed by '/*'
          if arg[1].endswith('/'):
              repo_location = arg[1] + '*'
          else:
              repo_location = arg[1] + '/*'

          try:
              lock = DaemonLock()
              try:
                  WhooshIndexingDaemon(repo_location=repo_location)\
                      .run(full_index=full_index)
              finally:
                  # release the lock even when indexing fails
                  lock.release()
              # NOTE(review): purpose of re-importing logging after a
              # successful run is not visible here - confirm it is needed.
              reload(logging)
          except LockHeld:
              sys.exit(1)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				#!/usr/bin/env python
				# encoding: utf-8
				# whoosh indexer daemon for hg-app
				# Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
				#
				# This program is free software; you can redistribute it and/or
				# modify it under the terms of the GNU General Public License
				# as published by the Free Software Foundation; version 2
				# of the License or (at your option) any later version of the license.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU General Public License
				# along with this program; if not, write to the Free Software
				# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
				# MA 02110-1301, USA.
				"""
				Created on Jan 26, 2010

				@author: marcink
				A daemon will read from task table and run tasks
				"""
				import sys
				import os
				from os.path import dirname as dn
				from os.path import join as jn

				#to get the pylons_app import
				project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
				sys.path.append(project_path)

				from pidlock import LockHeld, DaemonLock
				from pylons_app.model.hg_model import HgModel
				from pylons_app.lib.helpers import safe_unicode
				from whoosh.index import create_in, open_dir
				from shutil import rmtree
				from pylons_app.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME

				import logging

				# Dedicated logger for the indexer, used by every function in this module.
				log = logging.getLogger('whooshIndexer')
				# Emit everything down to DEBUG: the indexer logs one line per file.
				log.setLevel(logging.DEBUG)
				# Do not hand records to ancestor loggers; only the handler attached
				# below receives them.
				log.propagate = False
				# Console handler (StreamHandler without arguments writes to stderr),
				# also passing everything down to DEBUG.
				ch = logging.StreamHandler()
				ch.setLevel(logging.DEBUG)

				# Line layout: timestamp - logger name - level - message.
				formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

				# Attach the formatter to the console handler ...
				ch.setFormatter(formatter)

				# ... and the console handler to the indexer logger.
				log.addHandler(ch)

				def scan_paths(root_location):
				    """Look up the repositories found under ``root_location``.

				    Delegates to ``HgModel.repo_scan``.  Callers in this module invoke
				    ``.values()`` on the result and read ``path``, ``name`` and
				    ``contact`` from each value.
				    """
				    # NOTE(review): the meaning of the fixed '/', None and True
				    # arguments is defined by HgModel.repo_scan - confirm there.
				    scan_args = ('/', root_location, None, True)
				    return HgModel.repo_scan(*scan_args)

				class WhooshIndexingDaemon(object):
				    """Daemon that builds and incrementally updates the whoosh index.

				    The index lives in ``IDX_LOCATION``.  ``run`` either rebuilds it from
				    scratch (``build_index``) or refreshes it (``update_index``) from the
				    repositories returned by ``scan_paths(self.repo_location)``.
				    """

				    def __init__(self, indexname='HG_INDEX', repo_location=None):
				        """Store the settings and make sure the index directory exists.

				        :param indexname: index name ``update_index`` opens.
				        :param repo_location: location passed on to ``scan_paths``.
				        """
				        self.indexname = indexname
				        self.repo_location = repo_location
				        # When True, ``run`` does a full build even if an incremental
				        # one was requested.
				        self.initial = False
				        if not os.path.isdir(IDX_LOCATION):
				            os.mkdir(IDX_LOCATION)
				            log.info('Cannot run incremental index since it does not'
				                     ' yet exist running full build')
				            self.initial = True

				    def get_paths(self, root_dir):
				        """Return the set of all file paths below ``root_dir``.

				        Directories named exactly ``.hg`` are skipped together with
				        everything inside them.  Names that merely contain the text
				        ``.hg`` (``.hgignore``, ``my.hgtools/``) are kept.
				        """
				        index_paths_ = set()
				        for path, dirs, files in os.walk(root_dir):
				            # Prune in place so os.walk never descends into repository
				            # metadata.  A substring test on the whole path would also
				            # drop unrelated directories such as ``my.hgtools`` and
				            # every file below a root whose own path contains ``.hg``.
				            dirs[:] = [d for d in dirs if d != '.hg']
				            index_paths_.update(jn(path, f) for f in files)
				        return index_paths_

				    def add_doc(self, writer, path, repo):
				        """Add one document for ``path`` to ``writer``.

				        The file content is indexed only when the extension is listed in
				        ``INDEX_EXTENSIONS``; otherwise the document carries an empty
				        content field.  A path that no longer exists (or is a broken
				        symlink) is logged and skipped.

				        :param writer: index writer receiving ``add_document``.
				        :param path: file system path of the file.
				        :param repo: object providing ``contact`` and ``name``.
				        :raises IOError, OSError: any file system error other than ENOENT.
				        """
				        import errno

				        # For a name without a dot this is the whole lower-cased name.
				        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
				        with_content = ext in INDEX_EXTENSIONS
				        if with_content:
				            log.debug(' >> %s [WITH CONTENT]' % path)
				        else:
				            log.debug(' >> %s' % path)

				        try:
				            # Take the mtime before reading: if the file changes in
				            # between, the stored mtime is older than the file and the
				            # next incremental run picks the change up.
				            modtime = os.path.getmtime(path)
				            if with_content:
				                fobj = open(path, 'rb')
				                try:
				                    content = fobj.read()
				                finally:
				                    fobj.close()
				                u_content = safe_unicode(content)
				            else:
				                # just index the file name without its content
				                u_content = u''
				        except (IOError, OSError):
				            # open() raises IOError on Python 2, so both types are
				            # caught.  sys.exc_info() keeps this valid on every Python
				            # version (``except X, e`` is Python-2-only syntax and
				            # ``except X as e`` needs 2.6+).
				            err = sys.exc_info()[1]
				            if err.errno == errno.ENOENT:
				                log.debug('path %s does not exist or is a broken symlink' % path)
				                return
				            # bare raise keeps the original traceback
				            raise

				        writer.add_document(owner=unicode(repo.contact),
				                            repository=u"%s" % repo.name,
				                            path=u"%s" % path,
				                            content=u_content,
				                            modtime=modtime,
				                            extension=ext)

				    def build_index(self):
				        """Remove any existing index and build a new one from scratch."""
				        if os.path.exists(IDX_LOCATION):
				            log.debug('removing previos index')
				            rmtree(IDX_LOCATION)

				        if not os.path.exists(IDX_LOCATION):
				            os.mkdir(IDX_LOCATION)

				        # NOTE(review): the index is created under IDX_NAME while
				        # update_index opens self.indexname - the two must be equal for
				        # incremental runs to find this index; confirm.
				        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
				        writer = idx.writer()

				        for repo in scan_paths(self.repo_location).values():
				            log.debug('building index @ %s' % repo.path)

				            for idx_path in self.get_paths(repo.path):
				                self.add_doc(writer, idx_path, repo)
				        writer.commit(merge=True)

				        log.debug('>>> FINISHED BUILDING INDEX <<<')

				    def update_index(self):
				        """Bring the existing index in line with the files on disk.

				        Documents whose file no longer exists are deleted, documents
				        whose file has an mtime newer than the stored ``modtime`` are
				        deleted and added again, and files not yet indexed are added.
				        """
				        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

				        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
				        # The set of all paths in the index
				        indexed_paths = set()
				        # The set of all paths we need to re-index
				        to_index = set()

				        reader = idx.reader()
				        writer = idx.writer()

				        # Loop over the stored fields in the index
				        for fields in reader.all_stored_fields():
				            indexed_path = fields['path']
				            indexed_paths.add(indexed_path)

				            if not os.path.exists(indexed_path):
				                # This file was deleted since it was indexed
				                log.debug('removing from index %s' % indexed_path)
				                writer.delete_by_term('path', indexed_path)
				            elif os.path.getmtime(indexed_path) > fields['modtime']:
				                # The file has changed: drop the stale document and
				                # remember the path so it is added again below.
				                log.debug('adding to reindex list %s' % indexed_path)
				                writer.delete_by_term('path', indexed_path)
				                to_index.add(indexed_path)

				        # Loop over the files in the filesystem
				        for repo in scan_paths(self.repo_location).values():
				            for path in self.get_paths(repo.path):
				                if path in to_index or path not in indexed_paths:
				                    # This is either a file that's changed, or a new file
				                    # that wasn't indexed before. So index it!
				                    self.add_doc(writer, path, repo)
				                    log.debug('reindexing %s' % path)

				        writer.commit(merge=True)
				        log.debug('>>> FINISHED <<<')

				    def run(self, full_index=False):
				        """Run the daemon once.

				        :param full_index: when true, rebuild the whole index; a full
				            build is also done when no index directory existed at
				            construction time.
				        """
				        if full_index or self.initial:
				            self.build_index()
				        else:
				            self.update_index()

				if __name__ == "__main__":
				    # Command line entry point:
				    #     daemon.py <full|incremental> <path to repositories>
				    # Exit status: 2 for bad arguments, 1 when another instance holds
				    # the daemon lock.
				    arg = sys.argv[1:]
				    if len(arg) != 2:
				        sys.stderr.write('Please specify indexing type [full|incremental]'
				                         ' and path to repositories as script args \n')
				        sys.exit(2)

				    if arg[0] == 'full':
				        full_index = True
				    elif arg[0] == 'incremental':
				        # False means looking just for changes
				        full_index = False
				    else:
				        sys.stderr.write('Please use [full|incremental]'
				                         ' as script first arg \n')
				        sys.exit(2)

				    if not os.path.isdir(arg[1]):
				        sys.stderr.write('%s is not a valid path \n' % arg[1])
				        sys.exit(2)

				    # scan_paths receives the repositories directory followed by '/*'
				    if arg[1].endswith('/'):
				        repo_location = arg[1] + '*'
				    else:
				        repo_location = arg[1] + '/*'

				    try:
				        lock = DaemonLock()
				        try:
				            WhooshIndexingDaemon(repo_location=repo_location)\
				                .run(full_index=full_index)
				        finally:
				            # release the lock even when indexing fails
				            lock.release()
				        # NOTE(review): purpose of re-importing logging after a
				        # successful run is not visible here - confirm it is needed.
				        reload(logging)
				    except LockHeld:
				        sys.exit(1)