##// END OF EJS Templates
some fixes to whoosh indexer daemon
marcink -
r411:9b67cebe default
parent child Browse files
Show More
@@ -1,181 +1,181 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 # whoosh indexer daemon for hg-app
4 4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 5 #
6 6 # This program is free software; you can redistribute it and/or
7 7 # modify it under the terms of the GNU General Public License
8 8 # as published by the Free Software Foundation; version 2
9 9 # of the License or (at your option) any later version of the license.
10 10 #
11 11 # This program is distributed in the hope that it will be useful,
12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 14 # GNU General Public License for more details.
15 15 #
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program; if not, write to the Free Software
18 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 19 # MA 02110-1301, USA.
20 20 """
21 21 Created on Jan 26, 2010
22 22
23 23 @author: marcink
24 24 A daemon will read from task table and run tasks
25 25 """
26 26 import sys
27 27 import os
28 from pidlock import LockHeld, DaemonLock
29 import traceback
30
31 28 from os.path import dirname as dn
32 29 from os.path import join as jn
33 30
34 31 #to get the pylons_app import
35 sys.path.append(dn(dn(dn(dn(os.path.realpath(__file__))))))
32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 sys.path.append(project_path)
36 34
35 from pidlock import LockHeld, DaemonLock
36 import traceback
37 37 from pylons_app.config.environment import load_environment
38 38 from pylons_app.model.hg_model import HgModel
39 39 from whoosh.index import create_in, open_dir
40 40 from shutil import rmtree
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \
42 SCHEMA, IDX_NAME
43
42 44 import logging
43 log = logging.getLogger(__name__)
44
45
46 location = '/home/marcink/python_workspace_dirty/*'
45 import logging.config
46 logging.config.fileConfig(jn(project_path, 'development.ini'))
47 log = logging.getLogger('whooshIndexer')
47 48
def scan_paths(root_location):
    """Scan ``root_location`` for repositories via ``HgModel.repo_scan``.

    The result is returned untouched; callers in this module iterate over
    its ``.values()``, so it is presumably a mapping of repositories
    (NOTE(review): confirm against ``HgModel.repo_scan``).
    """
    repositories = HgModel.repo_scan('/', root_location, None, True)
    return repositories
50 51
class WhooshIndexingDaemon(object):
    """Daemon that builds and incrementally updates the whoosh index.

    :param indexname: name of the index opened by ``update_index``
    :param repo_location: location handed to ``scan_paths`` to discover
        the repositories whose files get indexed
    """

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Recursively walk ``root_dir`` and return the set of all file
        paths found, excluding everything stored inside ``.hg`` directories.
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into repository
            # metadata. Matching the directory *name* exactly avoids dropping
            # unrelated directories that merely contain '.hg' in their path.
            dirs[:] = [d for d in dirs if d != '.hg']
            index_paths_.update(jn(path, f) for f in files)

        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add the file at ``path`` to ``writer`` as one index document.

        :param writer: index writer exposing ``add_document``
        :param path: filesystem path of the file to index
        :param repo: repository object; ``repo.contact`` and ``repo.name``
            are stored alongside the document
        """
        # Files with an excluded extension are still indexed (owner,
        # repository, path, modtime) but their content is never read.
        extension = path.split('/')[-1].split('.')[-1].lower()
        if extension not in EXCLUDE_EXTENSIONS:
            # Context manager guarantees the handle is closed even when
            # reading fails.
            with open(path, 'rb') as fobj:
                content = fobj.read()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                # In case of a decode error fall back to an escaped
                # byte-string representation of the content.
                u_content = unicode(str(content).encode('string_escape'))
        else:
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=os.path.getmtime(path))

    def build_index(self):
        """Throw away any existing index and rebuild it from scratch.

        NOTE(review): the index is created under ``IDX_NAME`` while
        ``update_index`` opens ``self.indexname``; the two must match for
        incremental updates to find this index -- confirm.
        """
        if os.path.exists(IDX_LOCATION):
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(self.repo_location).values():
            log.debug('building index @ %s', repo.path)

            for idx_path in self.get_paths(repo.path):
                log.debug('    >> %s', idx_path)
                self.add_doc(writer, idx_path, repo)

        # Single commit once every repository has been processed.
        writer.commit(merge=True)

        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally bring the existing index in sync with the disk.

        Documents whose file disappeared are deleted, documents whose file
        has a newer mtime than the stored ``modtime`` are re-indexed, and
        files not yet present in the index are added.
        """
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s', indexed_path)
                writer.delete_by_term('path', indexed_path)
                continue

            # Check if this file was changed since it was indexed
            indexed_time = fields['modtime']
            mtime = os.path.getmtime(indexed_path)

            if mtime > indexed_time:
                # The file has changed: delete the stale document and
                # queue the path for re-indexing below.
                log.debug('adding to reindex list %s', indexed_path)
                writer.delete_by_term('path', indexed_path)
                to_index.add(indexed_path)

        # Loop over the files currently on the filesystem
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s', path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run the daemon: full rebuild when ``full_index`` is true,
        incremental update otherwise."""
        if full_index:
            self.build_index()
        else:
            self.update_index()
170 171
if __name__ == "__main__":
    # Location handed to the daemon for repository discovery.
    repo_location = '/home/marcink/python_workspace_dirty/*'

    try:
        l = DaemonLock()
        try:
            WhooshIndexingDaemon(repo_location=repo_location)\
                .run(full_index=True)
        finally:
            # Release the lock even when indexing fails, so a crashed run
            # does not leave the lock held for the next invocation.
            l.release()
    except LockHeld:
        # Another indexer instance already holds the lock.
        sys.exit(1)
181 181
General Comments 0
You need to be logged in to leave comments. Login now