# -*- coding: utf-8 -*-
"""
    rhodecode.lib.indexers.daemon
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    A daemon will read from task table and run tasks

    :created_on: Jan 26, 2010
    :author: marcink
    :copyright: (C) 2009-2011 Marcin Kuzminski <marcin@python-works.com>
    :license: GPLv3, see COPYING for more details.
"""
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import logging
import traceback

from shutil import rmtree
from time import mktime
from os.path import dirname as dn
from os.path import join as jn

# to get the rhodecode import
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)

from rhodecode.model.scm import ScmModel
from rhodecode.lib import safe_unicode
from rhodecode.lib.indexers import INDEX_EXTENSIONS, SCHEMA, IDX_NAME

from vcs.exceptions import ChangesetError, RepositoryError

from whoosh.index import create_in, open_dir

log = logging.getLogger('whooshIndexer')
# create logger
log.setLevel(logging.DEBUG)
log.propagate = False
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter("%(asctime)s - %(name)s -"
                              " %(levelname)s - %(message)s")

# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
log.addHandler(ch)


class WhooshIndexingDaemon(object): | ||||
r560 | """ | |||
r1377 | Daemon for atomic jobs | |||
r560 | """ | |||

    def __init__(self, indexname='HG_INDEX', index_location=None,
                 repo_location=None, sa=None, repo_list=None):
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        if repo_list:
            filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    filtered_repo_paths[repo_name] = repo

            self.repo_paths = filtered_repo_paths

        self.initial = False
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not'
                     ' yet exist, running full build')
            self.initial = True

    def get_paths(self, repo):
        """Recursively walk the repository root and return a set of all
        paths inside it, based on the repository walk function
        """
        index_paths_ = set()
        try:
            tip = repo.get_changeset('tip')
            # tip.walk('/') already descends into subdirectories, so
            # collecting the files yielded at each node covers everything
            for topnode, dirs, files in tip.walk('/'):
                for f in files:
                    index_paths_.add(jn(repo.path, f.path))
        except RepositoryError:
            log.debug(traceback.format_exc())

        return index_paths_

    def get_node(self, repo, path):
        n_path = path[len(repo.path) + 1:]
        node = repo.get_changeset().get_node(n_path)
        return node

    def get_node_mtime(self, node):
        return mktime(node.last_changeset.date.timetuple())

    def add_doc(self, writer, path, repo, repo_name):
        """Add a doc to the writer; this function itself fetches the data
        from the vcs backend instance"""
        node = self.get_node(repo, path)

        # we only index the content of chosen files, and skip binary files
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:

            u_content = node.content
            if not isinstance(u_content, unicode):
                log.warning(' >> %s Could not get this content as unicode, '
                            'replacing with empty content', path)
                u_content = u''
            else:
                log.debug(' >> %s [WITH CONTENT]' % path)
        else:
            log.debug(' >> %s' % path)
            # just index the file name without its content
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=safe_unicode(repo_name),
                            path=safe_unicode(path),
                            content=u_content,
                            modtime=self.get_node_mtime(node),
                            extension=node.extension)

    def build_index(self):
        """Build the index from scratch, removing any previous index first"""
        if os.path.exists(self.index_location):
            log.debug('removing previous index')
            rmtree(self.index_location)

        if not os.path.exists(self.index_location):
            os.mkdir(self.index_location)

        idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo_name, repo in self.repo_paths.items():
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo):
                self.add_doc(writer, idx_path, repo, repo_name)

        log.debug('>> COMMITTING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally update the index: remove deleted files, re-index
        changed files and add new ones"""
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(self.index_location, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            repo = self.repo_paths[fields['repository']]

            try:
                node = self.get_node(repo, indexed_path)
            except ChangesetError:
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['modtime']
                mtime = self.get_node_mtime(node)
                if mtime > indexed_time:
                    # The file has changed, delete it and add it to the
                    # list of files to reindex
                    log.debug('adding to reindex list %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        for repo_name, repo in self.repo_paths.items():
            for path in self.get_paths(repo):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo, repo_name)
                    log.debug('re-indexing %s' % path)

        log.debug('>> COMMITTING CHANGES <<')
        writer.commit(merge=True)
        log.debug('>>> FINISHED REBUILDING INDEX <<<')

    def run(self, full_index=False):
        """Run daemon"""
        if full_index or self.initial:
            self.build_index()
        else:
            self.update_index()
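

# Minimal usage sketch, not part of the original module: the real entry
# point for this daemon lives elsewhere in rhodecode, so the direct
# instantiation and the paths below ('/tmp/index', '/srv/repos') are
# illustrative assumptions only. It shows a full build followed by a
# simple whoosh query against the resulting index; running it assumes a
# configured RhodeCode environment (database session, scanned repos).
if __name__ == '__main__':
    daemon = WhooshIndexingDaemon(index_location='/tmp/index',
                                  repo_location='/srv/repos')
    daemon.run(full_index=True)

    # Query the freshly built index; 'content' and 'path' are fields
    # defined in the SCHEMA imported above
    from whoosh.qparser import QueryParser
    idx = open_dir('/tmp/index', indexname=IDX_NAME)
    with idx.searcher() as searcher:
        query = QueryParser('content', schema=SCHEMA).parse(u'import')
        for hit in searcher.search(query):
            print hit['path']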