##// END OF EJS Templates
added support for broken symlinks in whoosh indexer
marcink -
r441:c59c4d43 default
parent child Browse files
Show More
@@ -1,188 +1,199 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 # whoosh indexer daemon for hg-app
3 # whoosh indexer daemon for hg-app
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 #
5 #
6 # This program is free software; you can redistribute it and/or
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 2
8 # as published by the Free Software Foundation; version 2
9 # of the License or (at your option) any later version of the license.
9 # of the License or (at your option) any later version of the license.
10 #
10 #
11 # This program is distributed in the hope that it will be useful,
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
14 # GNU General Public License for more details.
15 #
15 #
16 # You should have received a copy of the GNU General Public License
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
19 # MA 02110-1301, USA.
20 """
20 """
21 Created on Jan 26, 2010
21 Created on Jan 26, 2010
22
22
23 @author: marcink
23 @author: marcink
24 A daemon will read from task table and run tasks
24 A daemon will read from task table and run tasks
25 """
25 """
26 import sys
26 import sys
27 import os
27 import os
28 from os.path import dirname as dn
28 from os.path import dirname as dn
29 from os.path import join as jn
29 from os.path import join as jn
30
30
31 #to get the pylons_app import
31 #to get the pylons_app import
32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 sys.path.append(project_path)
33 sys.path.append(project_path)
34
34
35 from pidlock import LockHeld, DaemonLock
35 from pidlock import LockHeld, DaemonLock
36 import traceback
36 import traceback
37 from pylons_app.config.environment import load_environment
37 from pylons_app.config.environment import load_environment
38 from pylons_app.model.hg_model import HgModel
38 from pylons_app.model.hg_model import HgModel
39 from whoosh.index import create_in, open_dir
39 from whoosh.index import create_in, open_dir
40 from shutil import rmtree
40 from shutil import rmtree
41 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
41 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
42 SCHEMA, IDX_NAME
42 SCHEMA, IDX_NAME
43
43
44 import logging
44 import logging
45 import logging.config
45 import logging.config
46 logging.config.fileConfig(jn(project_path, 'development.ini'))
46 logging.config.fileConfig(jn(project_path, 'development.ini'))
47 log = logging.getLogger('whooshIndexer')
47 log = logging.getLogger('whooshIndexer')
48
48
def scan_paths(root_location):
    """Return the result of ``HgModel.repo_scan`` for ``root_location``.

    Callers in this module iterate over ``.values()`` of the returned
    object and read ``path``, ``name`` and ``contact`` from each item.
    """
    repositories = HgModel.repo_scan('/', root_location, None, True)
    return repositories
51
51
class WhooshIndexingDaemon(object):
    """Daemon that builds and incrementally updates the whoosh index."""

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        """Store the indexing configuration.

        :param indexname: name of the whoosh index opened by
            :meth:`update_index`
        :param repo_location: location handed to ``scan_paths`` to
            discover the repositories to index
        """
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Recursively walk ``root_dir`` and return the set of file paths.

        Directories named exactly ``.hg`` are skipped together with
        everything below them. Files or directories that merely contain
        ``.hg`` in their name (``.hgignore``, ``my.hgtools``) are kept.

        :param root_dir: directory to walk
        :returns: set of paths, each built as ``join(dirpath, filename)``
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # prune in place so os.walk never descends into .hg at all
            dirs[:] = [d for d in dirs if d != '.hg']
            for f in files:
                index_paths_.add(jn(path, f))

        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add the file at ``path`` to ``writer`` as a document of ``repo``.

        Paths that do not exist (for example broken symlinks) are logged
        and skipped. File content is only indexed when the extension is
        listed in ``INDEX_EXTENSIONS``; for every other file only the
        metadata is stored and the content is left empty.

        :param writer: index writer exposing ``add_document``
        :param path: filesystem path of the file to index
        :param repo: object providing ``contact`` and ``name``
        :raises OSError: when ``os.stat`` fails for a reason other than
            the path being missing
        """
        import errno

        # stat before anything else: a dangling symlink must be detected
        # here, otherwise open() below would fail on it first
        try:
            stat_result = os.stat(path)
        except OSError as e:
            if e.errno == errno.ENOENT:
                log.debug('path %s does not exist or is a broken symlink' % path)
                return
            raise

        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
        #we just index the content of chosen files
        if ext in INDEX_EXTENSIONS:
            log.debug(' >> %s [WITH CONTENT]' % path)
            with open(path, 'rb') as fobj:
                content = fobj.read()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                #in case we have a decode error just represent as byte string
                u_content = unicode(str(content).encode('string_escape'))
        else:
            log.debug(' >> %s' % path)
            #just index file name without its content
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            # same value os.path.getmtime(path) would return
                            modtime=stat_result.st_mtime,
                            extension=ext)

    def build_index(self):
        """Remove any existing index and build a new one from scratch.

        Every file returned by :meth:`get_paths` for every repository
        found by ``scan_paths(self.repo_location)`` is added, then the
        writer is committed once at the end.
        """
        if os.path.exists(IDX_LOCATION):
            log.debug('removing previos index')
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        # NOTE(review): the index is created under IDX_NAME while
        # update_index() opens self.indexname - confirm both always match
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(self.repo_location).values():
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo.path):
                self.add_doc(writer, idx_path, repo)
        writer.commit(merge=True)

        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally update the existing index.

        Documents whose path no longer exists are deleted, documents
        whose file is newer than the stored ``modtime`` are deleted and
        re-added, and files not yet present in the index are added.
        """
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)

            else:
                # Check if this file was changed since it
                # was indexed
                indexed_time = fields['modtime']

                mtime = os.path.getmtime(indexed_path)

                if mtime > indexed_time:

                    # The file has changed, delete it and add it to the
                    # list of files to reindex
                    log.debug('adding to reindex list %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem and (re)index what the
        # pass above marked as changed, plus anything not indexed yet
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s' % path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run daemon

        :param full_index: when true rebuild the whole index with
            :meth:`build_index`, otherwise call :meth:`update_index`
        """
        if full_index:
            self.build_index()
        else:
            self.update_index()
177
188
if __name__ == "__main__":
    # Script entry point: take the daemon lock, run one indexing pass and
    # exit with status 1 when LockHeld is raised.
    repo_location = '/home/marcink/hg_repos/*'
    full_index = True # False means looking just for changes
    try:
        lock = DaemonLock()
        try:
            WhooshIndexingDaemon(repo_location=repo_location)\
                .run(full_index=full_index)
        finally:
            # release the lock even when indexing fails
            lock.release()
    except LockHeld:
        sys.exit(1)
188
199
General Comments 0
You need to be logged in to leave comments. Login now