fixed possible unicode decode errors in whoosh indexing
marcink -
r557:29ec9ddb default
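
The change below replaces the u"%s" interpolation of repository and path with safe_unicode(...) when adding documents to the index, so values containing non-ASCII bytes no longer raise UnicodeDecodeError during indexing. A minimal sketch of the failure mode, plus what such a helper typically does (the real rhodecode.lib.helpers.safe_unicode is not shown in this changeset, so the function body here is an assumption):

# Python 2: interpolating a non-ASCII byte string into a unicode literal
# forces an implicit ASCII decode, which is what used to fail.
path = 'caf\xc3\xa9.txt'        # UTF-8 encoded bytes for 'café.txt'
# u"%s" % path                  # would raise UnicodeDecodeError

# Hypothetical sketch of a safe_unicode() helper (an assumption, not the
# actual rhodecode.lib.helpers implementation):
def safe_unicode(s, from_encoding='utf8'):
    if isinstance(s, unicode):
        return s
    try:
        return s.decode(from_encoding)
    except UnicodeDecodeError:
        return s.decode('latin-1')  # last-resort fallback that cannot fail
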
@@ -1,238 +1,238 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 # whoosh indexer daemon for rhodecode
4 4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 5 #
6 6 # This program is free software; you can redistribute it and/or
7 7 # modify it under the terms of the GNU General Public License
8 8 # as published by the Free Software Foundation; version 2
9 9 # of the License or (at your option) any later version of the license.
10 10 #
11 11 # This program is distributed in the hope that it will be useful,
12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 14 # GNU General Public License for more details.
15 15 #
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program; if not, write to the Free Software
18 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 19 # MA 02110-1301, USA.
20 20 """
21 21 Created on Jan 26, 2010
22 22
23 23 @author: marcink
24 24 A daemon will read from the task table and run tasks
25 25 """
26 26 import sys
27 27 import os
28 28 from os.path import dirname as dn
29 29 from os.path import join as jn
30 30
31 31 #to get the rhodecode import
32 32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 33 sys.path.append(project_path)
34 34
35 35 from rhodecode.lib.pidlock import LockHeld, DaemonLock
36 36 from rhodecode.model.hg_model import HgModel
37 37 from rhodecode.lib.helpers import safe_unicode
38 38 from whoosh.index import create_in, open_dir
39 39 from shutil import rmtree
40 40 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41 41
42 42 import logging
43 43
44 44 log = logging.getLogger('whooshIndexer')
45 45 # create logger
46 46 log.setLevel(logging.DEBUG)
47 47 log.propagate = False
48 48 # create console handler and set level to debug
49 49 ch = logging.StreamHandler()
50 50 ch.setLevel(logging.DEBUG)
51 51
52 52 # create formatter
53 53 formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
54 54
55 55 # add formatter to ch
56 56 ch.setFormatter(formatter)
57 57
58 58 # add ch to logger
59 59 log.addHandler(ch)
60 60
61 61 def scan_paths(root_location):
62 62 return HgModel.repo_scan('/', root_location, None, True)
63 63
64 64 class WhooshIndexingDaemon(object):
65 65 """Deamon for atomic jobs"""
66 66
67 67 def __init__(self, indexname='HG_INDEX', repo_location=None):
68 68 self.indexname = indexname
69 69 self.repo_location = repo_location
70 70 self.initial = False
71 71 if not os.path.isdir(IDX_LOCATION):
72 72 os.mkdir(IDX_LOCATION)
73 73 log.info('Cannot run incremental index since it does not'
74 74 ' yet exist; running full build')
75 75 self.initial = True
76 76
77 77 def get_paths(self, root_dir):
78 78 """recursive walk in root dir and return a set of all path in that dir
79 79 excluding files in .hg dir"""
80 80 index_paths_ = set()
81 81 for path, dirs, files in os.walk(root_dir):
82 82 if path.find('.hg') == -1:
83 83 for f in files:
84 84 index_paths_.add(jn(path, f))
85 85
86 86 return index_paths_
87 87
88 88 def add_doc(self, writer, path, repo):
89 89 """Adding doc to writer"""
90 90
91 91 ext = unicode(path.split('/')[-1].split('.')[-1].lower())
92 92 #we just index the content of chosen files
93 93 if ext in INDEX_EXTENSIONS:
94 94 log.debug(' >> %s [WITH CONTENT]' % path)
95 95 fobj = open(path, 'rb')
96 96 content = fobj.read()
97 97 fobj.close()
98 98 u_content = safe_unicode(content)
99 99 else:
100 100 log.debug(' >> %s' % path)
101 101 #just index file name without its content
102 102 u_content = u''
103 103
104 104
105 105
106 106 try:
107 107 os.stat(path)
108 108 writer.add_document(owner=unicode(repo.contact),
109 repository=u"%s" % repo.name,
110 path=u"%s" % path,
109 repository=safe_unicode(repo.name),
110 path=safe_unicode(path),
111 111 content=u_content,
112 112 modtime=os.path.getmtime(path),
113 113 extension=ext)
114 114 except OSError, e:
115 115 import errno
116 116 if e.errno == errno.ENOENT:
117 117 log.debug('path %s does not exist or is a broken symlink' % path)
118 118 else:
119 119 raise e
120 120
121 121
122 122 def build_index(self):
123 123 if os.path.exists(IDX_LOCATION):
124 124 log.debug('removing previous index')
125 125 rmtree(IDX_LOCATION)
126 126
127 127 if not os.path.exists(IDX_LOCATION):
128 128 os.mkdir(IDX_LOCATION)
129 129
130 130 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
131 131 writer = idx.writer()
132 132
133 133 for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
134 134 log.debug('building index @ %s' % repo.path)
135 135
136 136 for idx_path in self.get_paths(repo.path):
137 137 self.add_doc(writer, idx_path, repo)
138 138 writer.commit(merge=True)
139 139
140 140 log.debug('>>> FINISHED BUILDING INDEX <<<')
141 141
142 142
143 143 def update_index(self):
144 144 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
145 145
146 146 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
147 147 # The set of all paths in the index
148 148 indexed_paths = set()
149 149 # The set of all paths we need to re-index
150 150 to_index = set()
151 151
152 152 reader = idx.reader()
153 153 writer = idx.writer()
154 154
155 155 # Loop over the stored fields in the index
156 156 for fields in reader.all_stored_fields():
157 157 indexed_path = fields['path']
158 158 indexed_paths.add(indexed_path)
159 159
160 160 if not os.path.exists(indexed_path):
161 161 # This file was deleted since it was indexed
162 162 log.debug('removing from index %s' % indexed_path)
163 163 writer.delete_by_term('path', indexed_path)
164 164
165 165 else:
166 166 # Check if this file was changed since it
167 167 # was indexed
168 168 indexed_time = fields['modtime']
169 169
170 170 mtime = os.path.getmtime(indexed_path)
171 171
172 172 if mtime > indexed_time:
173 173
174 174 # The file has changed, delete it and add it to the list of
175 175 # files to reindex
176 176 log.debug('adding to reindex list %s' % indexed_path)
177 177 writer.delete_by_term('path', indexed_path)
178 178 to_index.add(indexed_path)
179 179 #writer.commit()
180 180
181 181 # Loop over the files in the filesystem
182 182 # Assume we have a function that gathers the filenames of the
183 183 # documents to be indexed
184 184 for repo in scan_paths(self.repo_location).values():
185 185 for path in self.get_paths(repo.path):
186 186 if path in to_index or path not in indexed_paths:
187 187 # This is either a file that's changed, or a new file
188 188 # that wasn't indexed before. So index it!
189 189 self.add_doc(writer, path, repo)
190 190 log.debug('reindexing %s' % path)
191 191
192 192 writer.commit(merge=True)
193 193 #idx.optimize()
194 194 log.debug('>>> FINISHED <<<')
195 195
196 196 def run(self, full_index=False):
197 197 """Run daemon"""
198 198 if full_index or self.initial:
199 199 self.build_index()
200 200 else:
201 201 self.update_index()
202 202
203 203 if __name__ == "__main__":
204 204 arg = sys.argv[1:]
205 205 if len(arg) != 2:
206 206 sys.stderr.write('Please specify indexing type [full|incremental]'
207 207 ' and path to repositories as script args \n')
208 208 sys.exit()
209 209
210 210
211 211 if arg[0] == 'full':
212 212 full_index = True
213 213 elif arg[0] == 'incremental':
214 214 # False means looking just for changes
215 215 full_index = False
216 216 else:
217 217 sys.stdout.write('Please use [full|incremental]'
218 218 ' as script first arg \n')
219 219 sys.exit()
220 220
221 221 if not os.path.isdir(arg[1]):
222 222 sys.stderr.write('%s is not a valid path \n' % arg[1])
223 223 sys.exit()
224 224 else:
225 225 if arg[1].endswith('/'):
226 226 repo_location = arg[1] + '*'
227 227 else:
228 228 repo_location = arg[1] + '/*'
229 229
230 230 try:
231 231 l = DaemonLock()
232 232 WhooshIndexingDaemon(repo_location=repo_location)\
233 233 .run(full_index=full_index)
234 234 l.release()
235 235 reload(logging)
236 236 except LockHeld:
237 237 sys.exit(1)
238 238
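
For completeness, the daemon can also be driven from other code, mirroring the __main__ block above: the script itself expects 'full' or 'incremental' plus the repositories path as its two arguments, and appends '/*' to that path before handing it to WhooshIndexingDaemon. A sketch under those assumptions ('/var/repos' is a placeholder root, and WhooshIndexingDaemon is the class defined in this file):

from rhodecode.lib.pidlock import LockHeld, DaemonLock

try:
    lock = DaemonLock()
    # repo_location is a glob; the CLI handling above appends '/*' itself
    WhooshIndexingDaemon(repo_location='/var/repos/*').run(full_index=True)
    lock.release()
except LockHeld:
    pass  # another indexing run already holds the lock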