##// END OF EJS Templates
fixed #851 and #563: make-index crashes on non-ascii files
marcink -
r3921:932c84e8 beta
parent child Browse files
Show More
@@ -1,430 +1,439 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2 """
2 """
3 rhodecode.lib.indexers.daemon
3 rhodecode.lib.indexers.daemon
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5
5
6 A daemon will read from task table and run tasks
6 A daemon will read from task table and run tasks
7
7
8 :created_on: Jan 26, 2010
8 :created_on: Jan 26, 2010
9 :author: marcink
9 :author: marcink
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details.
11 :license: GPLv3, see COPYING for more details.
12 """
12 """
13 # This program is free software: you can redistribute it and/or modify
13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or
15 # the Free Software Foundation, either version 3 of the License, or
16 # (at your option) any later version.
16 # (at your option) any later version.
17 #
17 #
18 # This program is distributed in the hope that it will be useful,
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
21 # GNU General Public License for more details.
22 #
22 #
23 # You should have received a copy of the GNU General Public License
23 # You should have received a copy of the GNU General Public License
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 from __future__ import with_statement
25 from __future__ import with_statement
26
26
27 import os
27 import os
28 import sys
28 import sys
29 import logging
29 import logging
30 import traceback
30 import traceback
31
31
32 from shutil import rmtree
32 from shutil import rmtree
33 from time import mktime
33 from time import mktime
34
34
35 from os.path import dirname as dn
35 from os.path import dirname as dn
36 from os.path import join as jn
36 from os.path import join as jn
37
37
38 #to get the rhodecode import
38 #to get the rhodecode import
39 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
39 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
40 sys.path.append(project_path)
40 sys.path.append(project_path)
41
41
42 from rhodecode.config.conf import INDEX_EXTENSIONS
42 from rhodecode.config.conf import INDEX_EXTENSIONS
43 from rhodecode.model.scm import ScmModel
43 from rhodecode.model.scm import ScmModel
44 from rhodecode.model.db import Repository
44 from rhodecode.model.db import Repository
45 from rhodecode.lib.utils2 import safe_unicode, safe_str
45 from rhodecode.lib.utils2 import safe_unicode, safe_str
46 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
46 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
47 CHGSET_IDX_NAME
47 CHGSET_IDX_NAME
48
48
49 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
49 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
50 NodeDoesNotExistError
50 NodeDoesNotExistError
51
51
52 from whoosh.index import create_in, open_dir, exists_in
52 from whoosh.index import create_in, open_dir, exists_in
53 from whoosh.query import *
53 from whoosh.query import *
54 from whoosh.qparser import QueryParser
54 from whoosh.qparser import QueryParser
55
55
56 log = logging.getLogger('whoosh_indexer')
56 log = logging.getLogger('whoosh_indexer')
57
57
58
58
class WhooshIndexingDaemon(object):
    """
    Daemon for atomic indexing jobs
    """

    def __init__(self, indexname=IDX_NAME, index_location=None,
                 repo_location=None, sa=None, repo_list=None,
                 repo_update_list=None):
        """
        :param indexname: name of the whoosh file-content index
        :param index_location: directory holding the whoosh indexes (required)
        :param repo_location: root location of the repositories (required)
        :param sa: optional sqlalchemy session passed to ScmModel
        :param repo_list: if given, restrict indexing to these repo names
        :param repo_update_list: if given, restrict incremental updates to
            these repo names
        """
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        #filter repo list
        if repo_list:
            #Fix non-ascii repo names to unicode
            repo_list = map(safe_unicode, repo_list)
            self.filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    self.filtered_repo_paths[repo_name] = repo

            self.repo_paths = self.filtered_repo_paths

        #filter update repo list
        # NOTE: initialized once here; a duplicate re-initialization inside
        # the ``if`` branch was removed
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_update_list:
                    self.filtered_repo_update_paths[repo_name] = repo
            self.repo_paths = self.filtered_repo_update_paths

        # self.initial decides between a full build and an incremental
        # update in run(); any missing index directory/segment forces full
        self.initial = True
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not '
                     'yet exist running full build')
        elif not exists_in(self.index_location, IDX_NAME):
            log.info('Running full index build as the file content '
                     'index does not exist')
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
            log.info('Running full index build as the changeset '
                     'index does not exist')
        else:
            self.initial = False
113 def _get_index_revision(self, repo):
113 def _get_index_revision(self, repo):
114 db_repo = Repository.get_by_repo_name(repo.name)
114 db_repo = Repository.get_by_repo_name(repo.name)
115 landing_rev = 'tip'
115 landing_rev = 'tip'
116 if db_repo:
116 if db_repo:
117 landing_rev = db_repo.landing_rev
117 landing_rev = db_repo.landing_rev
118 return landing_rev
118 return landing_rev
119
119
120 def _get_index_changeset(self, repo):
120 def _get_index_changeset(self, repo):
121 index_rev = self._get_index_revision(repo)
121 index_rev = self._get_index_revision(repo)
122 cs = repo.get_changeset(index_rev)
122 cs = repo.get_changeset(index_rev)
123 return cs
123 return cs
124
124
125 def get_paths(self, repo):
125 def get_paths(self, repo):
126 """
126 """
127 recursive walk in root dir and return a set of all path in that dir
127 recursive walk in root dir and return a set of all path in that dir
128 based on repository walk function
128 based on repository walk function
129 """
129 """
130 index_paths_ = set()
130 index_paths_ = set()
131 try:
131 try:
132 cs = self._get_index_changeset(repo)
132 cs = self._get_index_changeset(repo)
133 for _topnode, _dirs, files in cs.walk('/'):
133 for _topnode, _dirs, files in cs.walk('/'):
134 for f in files:
134 for f in files:
135 index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
135 index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
136
136
137 except RepositoryError:
137 except RepositoryError:
138 log.debug(traceback.format_exc())
138 log.debug(traceback.format_exc())
139 pass
139 pass
140 return index_paths_
140 return index_paths_
141
141
142 def get_node(self, repo, path):
142 def get_node(self, repo, path):
143 n_path = path[len(repo.path) + 1:]
143 """
144 gets a filenode based on given full path.It operates on string for
145 hg git compatability.
146
147 :param repo: scm repo instance
148 :param path: full path including root location
149 :return: FileNode
150 """
151 root_path = safe_str(repo.path)+'/'
152 parts = safe_str(path).partition(root_path)
144 cs = self._get_index_changeset(repo)
153 cs = self._get_index_changeset(repo)
145 node = cs.get_node(n_path)
154 node = cs.get_node(parts[-1])
146 return node
155 return node
147
156
148 def get_node_mtime(self, node):
157 def get_node_mtime(self, node):
149 return mktime(node.last_changeset.date.timetuple())
158 return mktime(node.last_changeset.date.timetuple())
150
159
151 def add_doc(self, writer, path, repo, repo_name):
160 def add_doc(self, writer, path, repo, repo_name):
152 """
161 """
153 Adding doc to writer this function itself fetches data from
162 Adding doc to writer this function itself fetches data from
154 the instance of vcs backend
163 the instance of vcs backend
155 """
164 """
156
165
157 node = self.get_node(repo, path)
166 node = self.get_node(repo, path)
158 indexed = indexed_w_content = 0
167 indexed = indexed_w_content = 0
159 # we just index the content of chosen files, and skip binary files
168 # we just index the content of chosen files, and skip binary files
160 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
169 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
161 u_content = node.content
170 u_content = node.content
162 if not isinstance(u_content, unicode):
171 if not isinstance(u_content, unicode):
163 log.warning(' >> %s Could not get this content as unicode '
172 log.warning(' >> %s Could not get this content as unicode '
164 'replacing with empty content' % path)
173 'replacing with empty content' % path)
165 u_content = u''
174 u_content = u''
166 else:
175 else:
167 log.debug(' >> %s [WITH CONTENT]' % path)
176 log.debug(' >> %s [WITH CONTENT]' % path)
168 indexed_w_content += 1
177 indexed_w_content += 1
169
178
170 else:
179 else:
171 log.debug(' >> %s' % path)
180 log.debug(' >> %s' % path)
172 # just index file name without it's content
181 # just index file name without it's content
173 u_content = u''
182 u_content = u''
174 indexed += 1
183 indexed += 1
175
184
176 p = safe_unicode(path)
185 p = safe_unicode(path)
177 writer.add_document(
186 writer.add_document(
178 fileid=p,
187 fileid=p,
179 owner=unicode(repo.contact),
188 owner=unicode(repo.contact),
180 repository=safe_unicode(repo_name),
189 repository=safe_unicode(repo_name),
181 path=p,
190 path=p,
182 content=u_content,
191 content=u_content,
183 modtime=self.get_node_mtime(node),
192 modtime=self.get_node_mtime(node),
184 extension=node.extension
193 extension=node.extension
185 )
194 )
186 return indexed, indexed_w_content
195 return indexed, indexed_w_content
187
196
188 def index_changesets(self, writer, repo_name, repo, start_rev=None):
197 def index_changesets(self, writer, repo_name, repo, start_rev=None):
189 """
198 """
190 Add all changeset in the vcs repo starting at start_rev
199 Add all changeset in the vcs repo starting at start_rev
191 to the index writer
200 to the index writer
192
201
193 :param writer: the whoosh index writer to add to
202 :param writer: the whoosh index writer to add to
194 :param repo_name: name of the repository from whence the
203 :param repo_name: name of the repository from whence the
195 changeset originates including the repository group
204 changeset originates including the repository group
196 :param repo: the vcs repository instance to index changesets for,
205 :param repo: the vcs repository instance to index changesets for,
197 the presumption is the repo has changesets to index
206 the presumption is the repo has changesets to index
198 :param start_rev=None: the full sha id to start indexing from
207 :param start_rev=None: the full sha id to start indexing from
199 if start_rev is None then index from the first changeset in
208 if start_rev is None then index from the first changeset in
200 the repo
209 the repo
201 """
210 """
202
211
203 if start_rev is None:
212 if start_rev is None:
204 start_rev = repo[0].raw_id
213 start_rev = repo[0].raw_id
205
214
206 log.debug('indexing changesets in %s starting at rev: %s' %
215 log.debug('indexing changesets in %s starting at rev: %s' %
207 (repo_name, start_rev))
216 (repo_name, start_rev))
208
217
209 indexed = 0
218 indexed = 0
210 for cs in repo.get_changesets(start=start_rev):
219 for cs in repo.get_changesets(start=start_rev):
211 log.debug(' >> %s' % cs)
220 log.debug(' >> %s' % cs)
212 writer.add_document(
221 writer.add_document(
213 raw_id=unicode(cs.raw_id),
222 raw_id=unicode(cs.raw_id),
214 owner=unicode(repo.contact),
223 owner=unicode(repo.contact),
215 date=cs._timestamp,
224 date=cs._timestamp,
216 repository=safe_unicode(repo_name),
225 repository=safe_unicode(repo_name),
217 author=cs.author,
226 author=cs.author,
218 message=cs.message,
227 message=cs.message,
219 last=cs.last,
228 last=cs.last,
220 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
229 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
221 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
230 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
222 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
231 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
223 parents=u' '.join([cs.raw_id for cs in cs.parents]),
232 parents=u' '.join([cs.raw_id for cs in cs.parents]),
224 )
233 )
225 indexed += 1
234 indexed += 1
226
235
227 log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
236 log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
228 return indexed
237 return indexed
229
238
230 def index_files(self, file_idx_writer, repo_name, repo):
239 def index_files(self, file_idx_writer, repo_name, repo):
231 """
240 """
232 Index files for given repo_name
241 Index files for given repo_name
233
242
234 :param file_idx_writer: the whoosh index writer to add to
243 :param file_idx_writer: the whoosh index writer to add to
235 :param repo_name: name of the repository we're indexing
244 :param repo_name: name of the repository we're indexing
236 :param repo: instance of vcs repo
245 :param repo: instance of vcs repo
237 """
246 """
238 i_cnt = iwc_cnt = 0
247 i_cnt = iwc_cnt = 0
239 log.debug('building index for %s @revision:%s' % (repo.path,
248 log.debug('building index for %s @revision:%s' % (repo.path,
240 self._get_index_revision(repo)))
249 self._get_index_revision(repo)))
241 for idx_path in self.get_paths(repo):
250 for idx_path in self.get_paths(repo):
242 i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
251 i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
243 i_cnt += i
252 i_cnt += i
244 iwc_cnt += iwc
253 iwc_cnt += iwc
245
254
246 log.debug('added %s files %s with content for repo %s' %
255 log.debug('added %s files %s with content for repo %s' %
247 (i_cnt + iwc_cnt, iwc_cnt, repo.path))
256 (i_cnt + iwc_cnt, iwc_cnt, repo.path))
248 return i_cnt, iwc_cnt
257 return i_cnt, iwc_cnt
249
258
250 def update_changeset_index(self):
259 def update_changeset_index(self):
251 idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
260 idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
252
261
253 with idx.searcher() as searcher:
262 with idx.searcher() as searcher:
254 writer = idx.writer()
263 writer = idx.writer()
255 writer_is_dirty = False
264 writer_is_dirty = False
256 try:
265 try:
257 indexed_total = 0
266 indexed_total = 0
258 repo_name = None
267 repo_name = None
259 for repo_name, repo in self.repo_paths.items():
268 for repo_name, repo in self.repo_paths.items():
260 # skip indexing if there aren't any revs in the repo
269 # skip indexing if there aren't any revs in the repo
261 num_of_revs = len(repo)
270 num_of_revs = len(repo)
262 if num_of_revs < 1:
271 if num_of_revs < 1:
263 continue
272 continue
264
273
265 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
274 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
266 q = qp.parse(u"last:t AND %s" % repo_name)
275 q = qp.parse(u"last:t AND %s" % repo_name)
267
276
268 results = searcher.search(q)
277 results = searcher.search(q)
269
278
270 # default to scanning the entire repo
279 # default to scanning the entire repo
271 last_rev = 0
280 last_rev = 0
272 start_id = None
281 start_id = None
273
282
274 if len(results) > 0:
283 if len(results) > 0:
275 # assuming that there is only one result, if not this
284 # assuming that there is only one result, if not this
276 # may require a full re-index.
285 # may require a full re-index.
277 start_id = results[0]['raw_id']
286 start_id = results[0]['raw_id']
278 last_rev = repo.get_changeset(revision=start_id).revision
287 last_rev = repo.get_changeset(revision=start_id).revision
279
288
280 # there are new changesets to index or a new repo to index
289 # there are new changesets to index or a new repo to index
281 if last_rev == 0 or num_of_revs > last_rev + 1:
290 if last_rev == 0 or num_of_revs > last_rev + 1:
282 # delete the docs in the index for the previous
291 # delete the docs in the index for the previous
283 # last changeset(s)
292 # last changeset(s)
284 for hit in results:
293 for hit in results:
285 q = qp.parse(u"last:t AND %s AND raw_id:%s" %
294 q = qp.parse(u"last:t AND %s AND raw_id:%s" %
286 (repo_name, hit['raw_id']))
295 (repo_name, hit['raw_id']))
287 writer.delete_by_query(q)
296 writer.delete_by_query(q)
288
297
289 # index from the previous last changeset + all new ones
298 # index from the previous last changeset + all new ones
290 indexed_total += self.index_changesets(writer,
299 indexed_total += self.index_changesets(writer,
291 repo_name, repo, start_id)
300 repo_name, repo, start_id)
292 writer_is_dirty = True
301 writer_is_dirty = True
293 log.debug('indexed %s changesets for repo %s' % (
302 log.debug('indexed %s changesets for repo %s' % (
294 indexed_total, repo_name)
303 indexed_total, repo_name)
295 )
304 )
296 finally:
305 finally:
297 if writer_is_dirty:
306 if writer_is_dirty:
298 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
307 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
299 writer.commit(merge=True)
308 writer.commit(merge=True)
300 log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
309 log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
301 else:
310 else:
302 log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
311 log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
303
312
304 def update_file_index(self):
313 def update_file_index(self):
305 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
314 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
306 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
315 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
307
316
308 idx = open_dir(self.index_location, indexname=self.indexname)
317 idx = open_dir(self.index_location, indexname=self.indexname)
309 # The set of all paths in the index
318 # The set of all paths in the index
310 indexed_paths = set()
319 indexed_paths = set()
311 # The set of all paths we need to re-index
320 # The set of all paths we need to re-index
312 to_index = set()
321 to_index = set()
313
322
314 writer = idx.writer()
323 writer = idx.writer()
315 writer_is_dirty = False
324 writer_is_dirty = False
316 try:
325 try:
317 with idx.reader() as reader:
326 with idx.reader() as reader:
318
327
319 # Loop over the stored fields in the index
328 # Loop over the stored fields in the index
320 for fields in reader.all_stored_fields():
329 for fields in reader.all_stored_fields():
321 indexed_path = fields['path']
330 indexed_path = fields['path']
322 indexed_repo_path = fields['repository']
331 indexed_repo_path = fields['repository']
323 indexed_paths.add(indexed_path)
332 indexed_paths.add(indexed_path)
324
333
325 if not indexed_repo_path in self.filtered_repo_update_paths:
334 if not indexed_repo_path in self.filtered_repo_update_paths:
326 continue
335 continue
327
336
328 repo = self.repo_paths[indexed_repo_path]
337 repo = self.repo_paths[indexed_repo_path]
329
338
330 try:
339 try:
331 node = self.get_node(repo, indexed_path)
340 node = self.get_node(repo, indexed_path)
332 # Check if this file was changed since it was indexed
341 # Check if this file was changed since it was indexed
333 indexed_time = fields['modtime']
342 indexed_time = fields['modtime']
334 mtime = self.get_node_mtime(node)
343 mtime = self.get_node_mtime(node)
335 if mtime > indexed_time:
344 if mtime > indexed_time:
336 # The file has changed, delete it and add it to
345 # The file has changed, delete it and add it to
337 # the list of files to reindex
346 # the list of files to reindex
338 log.debug(
347 log.debug(
339 'adding to reindex list %s mtime: %s vs %s' % (
348 'adding to reindex list %s mtime: %s vs %s' % (
340 indexed_path, mtime, indexed_time)
349 indexed_path, mtime, indexed_time)
341 )
350 )
342 writer.delete_by_term('fileid', indexed_path)
351 writer.delete_by_term('fileid', indexed_path)
343 writer_is_dirty = True
352 writer_is_dirty = True
344
353
345 to_index.add(indexed_path)
354 to_index.add(indexed_path)
346 except (ChangesetError, NodeDoesNotExistError):
355 except (ChangesetError, NodeDoesNotExistError):
347 # This file was deleted since it was indexed
356 # This file was deleted since it was indexed
348 log.debug('removing from index %s' % indexed_path)
357 log.debug('removing from index %s' % indexed_path)
349 writer.delete_by_term('path', indexed_path)
358 writer.delete_by_term('path', indexed_path)
350 writer_is_dirty = True
359 writer_is_dirty = True
351
360
352 # Loop over the files in the filesystem
361 # Loop over the files in the filesystem
353 # Assume we have a function that gathers the filenames of the
362 # Assume we have a function that gathers the filenames of the
354 # documents to be indexed
363 # documents to be indexed
355 ri_cnt_total = 0 # indexed
364 ri_cnt_total = 0 # indexed
356 riwc_cnt_total = 0 # indexed with content
365 riwc_cnt_total = 0 # indexed with content
357 for repo_name, repo in self.repo_paths.items():
366 for repo_name, repo in self.repo_paths.items():
358 # skip indexing if there aren't any revisions
367 # skip indexing if there aren't any revisions
359 if len(repo) < 1:
368 if len(repo) < 1:
360 continue
369 continue
361 ri_cnt = 0 # indexed
370 ri_cnt = 0 # indexed
362 riwc_cnt = 0 # indexed with content
371 riwc_cnt = 0 # indexed with content
363 for path in self.get_paths(repo):
372 for path in self.get_paths(repo):
364 path = safe_unicode(path)
373 path = safe_unicode(path)
365 if path in to_index or path not in indexed_paths:
374 if path in to_index or path not in indexed_paths:
366
375
367 # This is either a file that's changed, or a new file
376 # This is either a file that's changed, or a new file
368 # that wasn't indexed before. So index it!
377 # that wasn't indexed before. So index it!
369 i, iwc = self.add_doc(writer, path, repo, repo_name)
378 i, iwc = self.add_doc(writer, path, repo, repo_name)
370 writer_is_dirty = True
379 writer_is_dirty = True
371 log.debug('re indexing %s' % path)
380 log.debug('re indexing %s' % path)
372 ri_cnt += i
381 ri_cnt += i
373 ri_cnt_total += 1
382 ri_cnt_total += 1
374 riwc_cnt += iwc
383 riwc_cnt += iwc
375 riwc_cnt_total += iwc
384 riwc_cnt_total += iwc
376 log.debug('added %s files %s with content for repo %s' % (
385 log.debug('added %s files %s with content for repo %s' % (
377 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
386 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
378 )
387 )
379 log.debug('indexed %s files in total and %s with content' % (
388 log.debug('indexed %s files in total and %s with content' % (
380 ri_cnt_total, riwc_cnt_total)
389 ri_cnt_total, riwc_cnt_total)
381 )
390 )
382 finally:
391 finally:
383 if writer_is_dirty:
392 if writer_is_dirty:
384 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
393 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
385 writer.commit(merge=True)
394 writer.commit(merge=True)
386 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
395 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
387 else:
396 else:
388 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
397 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
389 writer.cancel()
398 writer.cancel()
390
399
391 def build_indexes(self):
400 def build_indexes(self):
392 if os.path.exists(self.index_location):
401 if os.path.exists(self.index_location):
393 log.debug('removing previous index')
402 log.debug('removing previous index')
394 rmtree(self.index_location)
403 rmtree(self.index_location)
395
404
396 if not os.path.exists(self.index_location):
405 if not os.path.exists(self.index_location):
397 os.mkdir(self.index_location)
406 os.mkdir(self.index_location)
398
407
399 chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
408 chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
400 indexname=CHGSET_IDX_NAME)
409 indexname=CHGSET_IDX_NAME)
401 chgset_idx_writer = chgset_idx.writer()
410 chgset_idx_writer = chgset_idx.writer()
402
411
403 file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
412 file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
404 file_idx_writer = file_idx.writer()
413 file_idx_writer = file_idx.writer()
405 log.debug('BUILDING INDEX FOR EXTENSIONS %s '
414 log.debug('BUILDING INDEX FOR EXTENSIONS %s '
406 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
415 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
407
416
408 for repo_name, repo in self.repo_paths.items():
417 for repo_name, repo in self.repo_paths.items():
409 # skip indexing if there aren't any revisions
418 # skip indexing if there aren't any revisions
410 if len(repo) < 1:
419 if len(repo) < 1:
411 continue
420 continue
412
421
413 self.index_files(file_idx_writer, repo_name, repo)
422 self.index_files(file_idx_writer, repo_name, repo)
414 self.index_changesets(chgset_idx_writer, repo_name, repo)
423 self.index_changesets(chgset_idx_writer, repo_name, repo)
415
424
416 log.debug('>> COMMITING CHANGES <<')
425 log.debug('>> COMMITING CHANGES <<')
417 file_idx_writer.commit(merge=True)
426 file_idx_writer.commit(merge=True)
418 chgset_idx_writer.commit(merge=True)
427 chgset_idx_writer.commit(merge=True)
419 log.debug('>>> FINISHED BUILDING INDEX <<<')
428 log.debug('>>> FINISHED BUILDING INDEX <<<')
420
429
421 def update_indexes(self):
430 def update_indexes(self):
422 self.update_file_index()
431 self.update_file_index()
423 self.update_changeset_index()
432 self.update_changeset_index()
424
433
425 def run(self, full_index=False):
434 def run(self, full_index=False):
426 """Run daemon"""
435 """Run daemon"""
427 if full_index or self.initial:
436 if full_index or self.initial:
428 self.build_indexes()
437 self.build_indexes()
429 else:
438 else:
430 self.update_indexes()
439 self.update_indexes()
General Comments 0
You need to be logged in to leave comments. Login now