##// END OF EJS Templates
fixed whoosh indexing possible unicode decode errors
marcink -
r557:29ec9ddb default
parent child Browse files
Show More
@@ -1,238 +1,238 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 # whoosh indexer daemon for rhodecode
3 # whoosh indexer daemon for rhodecode
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 #
5 #
6 # This program is free software; you can redistribute it and/or
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 2
8 # as published by the Free Software Foundation; version 2
9 # of the License or (at your opinion) any later version of the license.
9 # of the License or (at your opinion) any later version of the license.
10 #
10 #
11 # This program is distributed in the hope that it will be useful,
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
14 # GNU General Public License for more details.
15 #
15 #
16 # You should have received a copy of the GNU General Public License
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
19 # MA 02110-1301, USA.
20 """
20 """
21 Created on Jan 26, 2010
21 Created on Jan 26, 2010
22
22
23 @author: marcink
23 @author: marcink
24 A deamon will read from task table and run tasks
24 A deamon will read from task table and run tasks
25 """
25 """
26 import sys
26 import sys
27 import os
27 import os
28 from os.path import dirname as dn
28 from os.path import dirname as dn
29 from os.path import join as jn
29 from os.path import join as jn
30
30
31 #to get the rhodecode import
31 #to get the rhodecode import
32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 sys.path.append(project_path)
33 sys.path.append(project_path)
34
34
35 from rhodecode.lib.pidlock import LockHeld, DaemonLock
35 from rhodecode.lib.pidlock import LockHeld, DaemonLock
36 from rhodecode.model.hg_model import HgModel
36 from rhodecode.model.hg_model import HgModel
37 from rhodecode.lib.helpers import safe_unicode
37 from rhodecode.lib.helpers import safe_unicode
38 from whoosh.index import create_in, open_dir
38 from whoosh.index import create_in, open_dir
39 from shutil import rmtree
39 from shutil import rmtree
40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41
41
42 import logging
42 import logging
43
43
# Dedicated logger for the indexer: DEBUG level, not propagated to the
# root logger, emitting timestamped records to the console.
log = logging.getLogger('whooshIndexer')
log.setLevel(logging.DEBUG)
log.propagate = False

# console handler at DEBUG with the standard "time - name - level - msg" layout
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
log.addHandler(ch)
60
60
def scan_paths(root_location):
    """Return the repositories discovered under *root_location*."""
    repos = HgModel.repo_scan('/', root_location, None, True)
    return repos
63
63
64 class WhooshIndexingDaemon(object):
64 class WhooshIndexingDaemon(object):
65 """Deamon for atomic jobs"""
65 """Deamon for atomic jobs"""
66
66
67 def __init__(self, indexname='HG_INDEX', repo_location=None):
67 def __init__(self, indexname='HG_INDEX', repo_location=None):
68 self.indexname = indexname
68 self.indexname = indexname
69 self.repo_location = repo_location
69 self.repo_location = repo_location
70 self.initial = False
70 self.initial = False
71 if not os.path.isdir(IDX_LOCATION):
71 if not os.path.isdir(IDX_LOCATION):
72 os.mkdir(IDX_LOCATION)
72 os.mkdir(IDX_LOCATION)
73 log.info('Cannot run incremental index since it does not'
73 log.info('Cannot run incremental index since it does not'
74 ' yet exist running full build')
74 ' yet exist running full build')
75 self.initial = True
75 self.initial = True
76
76
77 def get_paths(self, root_dir):
77 def get_paths(self, root_dir):
78 """recursive walk in root dir and return a set of all path in that dir
78 """recursive walk in root dir and return a set of all path in that dir
79 excluding files in .hg dir"""
79 excluding files in .hg dir"""
80 index_paths_ = set()
80 index_paths_ = set()
81 for path, dirs, files in os.walk(root_dir):
81 for path, dirs, files in os.walk(root_dir):
82 if path.find('.hg') == -1:
82 if path.find('.hg') == -1:
83 for f in files:
83 for f in files:
84 index_paths_.add(jn(path, f))
84 index_paths_.add(jn(path, f))
85
85
86 return index_paths_
86 return index_paths_
87
87
88 def add_doc(self, writer, path, repo):
88 def add_doc(self, writer, path, repo):
89 """Adding doc to writer"""
89 """Adding doc to writer"""
90
90
91 ext = unicode(path.split('/')[-1].split('.')[-1].lower())
91 ext = unicode(path.split('/')[-1].split('.')[-1].lower())
92 #we just index the content of choosen files
92 #we just index the content of choosen files
93 if ext in INDEX_EXTENSIONS:
93 if ext in INDEX_EXTENSIONS:
94 log.debug(' >> %s [WITH CONTENT]' % path)
94 log.debug(' >> %s [WITH CONTENT]' % path)
95 fobj = open(path, 'rb')
95 fobj = open(path, 'rb')
96 content = fobj.read()
96 content = fobj.read()
97 fobj.close()
97 fobj.close()
98 u_content = safe_unicode(content)
98 u_content = safe_unicode(content)
99 else:
99 else:
100 log.debug(' >> %s' % path)
100 log.debug(' >> %s' % path)
101 #just index file name without it's content
101 #just index file name without it's content
102 u_content = u''
102 u_content = u''
103
103
104
104
105
105
106 try:
106 try:
107 os.stat(path)
107 os.stat(path)
108 writer.add_document(owner=unicode(repo.contact),
108 writer.add_document(owner=unicode(repo.contact),
109 repository=u"%s" % repo.name,
109 repository=safe_unicode(repo.name),
110 path=u"%s" % path,
110 path=safe_unicode(path),
111 content=u_content,
111 content=u_content,
112 modtime=os.path.getmtime(path),
112 modtime=os.path.getmtime(path),
113 extension=ext)
113 extension=ext)
114 except OSError, e:
114 except OSError, e:
115 import errno
115 import errno
116 if e.errno == errno.ENOENT:
116 if e.errno == errno.ENOENT:
117 log.debug('path %s does not exist or is a broken symlink' % path)
117 log.debug('path %s does not exist or is a broken symlink' % path)
118 else:
118 else:
119 raise e
119 raise e
120
120
121
121
122 def build_index(self):
122 def build_index(self):
123 if os.path.exists(IDX_LOCATION):
123 if os.path.exists(IDX_LOCATION):
124 log.debug('removing previos index')
124 log.debug('removing previos index')
125 rmtree(IDX_LOCATION)
125 rmtree(IDX_LOCATION)
126
126
127 if not os.path.exists(IDX_LOCATION):
127 if not os.path.exists(IDX_LOCATION):
128 os.mkdir(IDX_LOCATION)
128 os.mkdir(IDX_LOCATION)
129
129
130 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
130 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
131 writer = idx.writer()
131 writer = idx.writer()
132
132
133 for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
133 for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
134 log.debug('building index @ %s' % repo.path)
134 log.debug('building index @ %s' % repo.path)
135
135
136 for idx_path in self.get_paths(repo.path):
136 for idx_path in self.get_paths(repo.path):
137 self.add_doc(writer, idx_path, repo)
137 self.add_doc(writer, idx_path, repo)
138 writer.commit(merge=True)
138 writer.commit(merge=True)
139
139
140 log.debug('>>> FINISHED BUILDING INDEX <<<')
140 log.debug('>>> FINISHED BUILDING INDEX <<<')
141
141
142
142
143 def update_index(self):
143 def update_index(self):
144 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
144 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
145
145
146 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
146 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
147 # The set of all paths in the index
147 # The set of all paths in the index
148 indexed_paths = set()
148 indexed_paths = set()
149 # The set of all paths we need to re-index
149 # The set of all paths we need to re-index
150 to_index = set()
150 to_index = set()
151
151
152 reader = idx.reader()
152 reader = idx.reader()
153 writer = idx.writer()
153 writer = idx.writer()
154
154
155 # Loop over the stored fields in the index
155 # Loop over the stored fields in the index
156 for fields in reader.all_stored_fields():
156 for fields in reader.all_stored_fields():
157 indexed_path = fields['path']
157 indexed_path = fields['path']
158 indexed_paths.add(indexed_path)
158 indexed_paths.add(indexed_path)
159
159
160 if not os.path.exists(indexed_path):
160 if not os.path.exists(indexed_path):
161 # This file was deleted since it was indexed
161 # This file was deleted since it was indexed
162 log.debug('removing from index %s' % indexed_path)
162 log.debug('removing from index %s' % indexed_path)
163 writer.delete_by_term('path', indexed_path)
163 writer.delete_by_term('path', indexed_path)
164
164
165 else:
165 else:
166 # Check if this file was changed since it
166 # Check if this file was changed since it
167 # was indexed
167 # was indexed
168 indexed_time = fields['modtime']
168 indexed_time = fields['modtime']
169
169
170 mtime = os.path.getmtime(indexed_path)
170 mtime = os.path.getmtime(indexed_path)
171
171
172 if mtime > indexed_time:
172 if mtime > indexed_time:
173
173
174 # The file has changed, delete it and add it to the list of
174 # The file has changed, delete it and add it to the list of
175 # files to reindex
175 # files to reindex
176 log.debug('adding to reindex list %s' % indexed_path)
176 log.debug('adding to reindex list %s' % indexed_path)
177 writer.delete_by_term('path', indexed_path)
177 writer.delete_by_term('path', indexed_path)
178 to_index.add(indexed_path)
178 to_index.add(indexed_path)
179 #writer.commit()
179 #writer.commit()
180
180
181 # Loop over the files in the filesystem
181 # Loop over the files in the filesystem
182 # Assume we have a function that gathers the filenames of the
182 # Assume we have a function that gathers the filenames of the
183 # documents to be indexed
183 # documents to be indexed
184 for repo in scan_paths(self.repo_location).values():
184 for repo in scan_paths(self.repo_location).values():
185 for path in self.get_paths(repo.path):
185 for path in self.get_paths(repo.path):
186 if path in to_index or path not in indexed_paths:
186 if path in to_index or path not in indexed_paths:
187 # This is either a file that's changed, or a new file
187 # This is either a file that's changed, or a new file
188 # that wasn't indexed before. So index it!
188 # that wasn't indexed before. So index it!
189 self.add_doc(writer, path, repo)
189 self.add_doc(writer, path, repo)
190 log.debug('reindexing %s' % path)
190 log.debug('reindexing %s' % path)
191
191
192 writer.commit(merge=True)
192 writer.commit(merge=True)
193 #idx.optimize()
193 #idx.optimize()
194 log.debug('>>> FINISHED <<<')
194 log.debug('>>> FINISHED <<<')
195
195
196 def run(self, full_index=False):
196 def run(self, full_index=False):
197 """Run daemon"""
197 """Run daemon"""
198 if full_index or self.initial:
198 if full_index or self.initial:
199 self.build_index()
199 self.build_index()
200 else:
200 else:
201 self.update_index()
201 self.update_index()
202
202
if __name__ == "__main__":
    arg = sys.argv[1:]
    if len(arg) != 2:
        # NOTE: the leading space in the second literal is required -
        # without it the message reads "[full|incremental]and path"
        sys.stderr.write('Please specify indexing type [full|incremental]'
                         ' and path to repositories as script args \n')
        sys.exit()

    if arg[0] == 'full':
        full_index = True
    elif arg[0] == 'incremental':
        # False means looking just for changes
        full_index = False
    else:
        sys.stdout.write('Please use [full|incremental]'
                         ' as script first arg \n')
        sys.exit()

    if not os.path.isdir(arg[1]):
        sys.stderr.write('%s is not a valid path \n' % arg[1])
        sys.exit()
    else:
        # turn the directory into a glob matching every repo below it
        if arg[1].endswith('/'):
            repo_location = arg[1] + '*'
        else:
            repo_location = arg[1] + '/*'

    try:
        # DaemonLock guarantees only one indexer instance runs at a time
        l = DaemonLock()
        WhooshIndexingDaemon(repo_location=repo_location)\
            .run(full_index=full_index)
        l.release()
        reload(logging)
    except LockHeld:
        # another indexing run is in progress - exit non-zero
        sys.exit(1)
238
238
General Comments 0
You need to be logged in to leave comments. Login now