daemon.py
181 lines
| 6.4 KiB
| text/x-python
|
PythonLexer
r406 | #!/usr/bin/env python | |||
# encoding: utf-8 | ||||
# whoosh indexer daemon for hg-app | ||||
# Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com> | ||||
# | ||||
# This program is free software; you can redistribute it and/or | ||||
# modify it under the terms of the GNU General Public License | ||||
# as published by the Free Software Foundation; version 2 | ||||
# of the License or (at your option) any later version of the license.
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU General Public License | ||||
# along with this program; if not, write to the Free Software | ||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | ||||
# MA 02110-1301, USA. | ||||
""" | ||||
Created on Jan 26, 2010 | ||||
@author: marcink | ||||
A daemon will read from task table and run tasks
""" | ||||
import sys | ||||
import os | ||||
from os.path import dirname as dn | ||||
from os.path import join as jn | ||||
# Apply dirname four times to this file's real path and append the result to
# sys.path so that the pylons_app imports below can be resolved.
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)
r406 | ||||
r411 | from pidlock import LockHeld, DaemonLock | |||
import traceback | ||||
r406 | from pylons_app.config.environment import load_environment | |||
from pylons_app.model.hg_model import HgModel | ||||
from whoosh.index import create_in, open_dir | ||||
from shutil import rmtree | ||||
r411 | from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \ | |||
SCHEMA, IDX_NAME | ||||
r406 | import logging | |||
r411 | import logging.config | |||
# Configure logging from the development.ini located in project_path; this
# runs at import time, before any indexing code is executed.
logging.config.fileConfig(jn(project_path, 'development.ini'))
# Module-wide logger used by the indexing daemon below.
log = logging.getLogger('whooshIndexer')
r406 | ||||
def scan_paths(root_location):
    """Scan ``root_location`` for repositories using ``HgModel.repo_scan``.

    The result of ``HgModel.repo_scan`` is returned unchanged; callers in
    this module iterate over its ``.values()``.
    """
    scan_args = ('/', root_location, None, True)
    return HgModel.repo_scan(*scan_args)
class WhooshIndexingDaemon(object):
    """Daemon that builds and incrementally updates the Whoosh index."""

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        """Store the index name and the location scanned for repositories.

        :param indexname: name of the index opened by ``update_index``
        :param repo_location: location handed to ``scan_paths``
        """
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Recursively walk ``root_dir`` and return a set of all file paths
        found in it, excluding everything stored inside ``.hg`` directories.
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into the repository
            # metadata. Matching on the exact directory name (rather than a
            # substring of the whole path) keeps directories such as
            # ``my.hgext`` indexable.
            dirs[:] = [d for d in dirs if d != '.hg']
            for f in files:
                index_paths_.add(jn(path, f))
        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add the file at ``path`` as a document to ``writer``.

        :param writer: index writer receiving the document
        :param path: filesystem path of the file to index
        :param repo: repository object; its ``contact`` and ``name``
            attributes are stored with the document
        """
        # We don't want to read files with excluded extensions; they are
        # still indexed, just with empty content.
        extension = path.split('/')[-1].split('.')[-1].lower()
        if extension not in EXCLUDE_EXTENSIONS:
            fobj = open(path, 'rb')
            try:
                content = fobj.read()
            finally:
                # Close the handle even when reading fails.
                fobj.close()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                # In case of a decode error represent the content as an
                # escaped byte string.
                u_content = unicode(str(content).encode('string_escape'))
        else:
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=os.path.getmtime(path))

    def build_index(self):
        """Drop any existing index directory and build a fresh index of all
        files in the scanned repositories.
        """
        if os.path.exists(IDX_LOCATION):
            rmtree(IDX_LOCATION)
        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        # NOTE(review): the index is created under IDX_NAME while
        # update_index() opens self.indexname -- confirm both always name
        # the same index.
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(self.repo_location).values():
            log.debug('building index @ %s', repo.path)
            for idx_path in self.get_paths(repo.path):
                log.debug(' >> %s', idx_path)
                self.add_doc(writer, idx_path, repo)

        writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally update the index: remove documents of deleted
        files, re-index files modified since they were indexed and add
        files that were not indexed before.
        """
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')
        idx = open_dir(IDX_LOCATION, indexname=self.indexname)

        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        try:
            # Loop over the stored fields in the index
            for fields in reader.all_stored_fields():
                indexed_path = fields['path']
                indexed_paths.add(indexed_path)

                if not os.path.exists(indexed_path):
                    # This file was deleted since it was indexed
                    log.debug('removing from index %s', indexed_path)
                    writer.delete_by_term('path', indexed_path)
                else:
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['modtime']
                    mtime = os.path.getmtime(indexed_path)
                    if mtime > indexed_time:
                        # The file has changed, delete it and add it to the
                        # list of files to reindex
                        log.debug('adding to reindex list %s', indexed_path)
                        writer.delete_by_term('path', indexed_path)
                        to_index.add(indexed_path)
        finally:
            # The reader is only needed for the scan above; release it.
            reader.close()

        # Loop over the files currently present in the scanned repositories
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s', path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run daemon.

        :param full_index: when true rebuild the whole index from scratch,
            otherwise perform an incremental update
        """
        if full_index:
            self.build_index()
        else:
            self.update_index()
if __name__ == "__main__":
    # Location scanned for repositories. The first command line argument,
    # when given, overrides the built-in default.
    repo_location = '/home/marcink/python_workspace_dirty/*'
    if len(sys.argv) > 1:
        repo_location = sys.argv[1]

    try:
        l = DaemonLock()
        try:
            WhooshIndexingDaemon(repo_location=repo_location)\
                .run(full_index=True)
        finally:
            # Release the lock even when indexing fails, so a crashed run
            # does not leave the lock held.
            l.release()
    except LockHeld:
        # Presumably another instance already holds the lock -- exit with a
        # non-zero status instead of indexing concurrently.
        sys.exit(1)