fixed #850 Whoosh indexer should use the default revision flag to make index
marcink
r3916:ba08786c beta
--- a/rhodecode/lib/indexers/daemon.py
+++ b/rhodecode/lib/indexers/daemon.py
@@ -1,416 +1,430 @@
 # -*- coding: utf-8 -*-
 """
     rhodecode.lib.indexers.daemon
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     A daemon will read from task table and run tasks

     :created_on: Jan 26, 2010
     :author: marcink
     :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
     :license: GPLv3, see COPYING for more details.
 """
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 from __future__ import with_statement

 import os
 import sys
 import logging
 import traceback

 from shutil import rmtree
 from time import mktime

 from os.path import dirname as dn
 from os.path import join as jn

 #to get the rhodecode import
 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 sys.path.append(project_path)

 from rhodecode.config.conf import INDEX_EXTENSIONS
 from rhodecode.model.scm import ScmModel
+from rhodecode.model.db import Repository
 from rhodecode.lib.utils2 import safe_unicode, safe_str
 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
     CHGSET_IDX_NAME

 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
     NodeDoesNotExistError

 from whoosh.index import create_in, open_dir, exists_in
 from whoosh.query import *
 from whoosh.qparser import QueryParser

 log = logging.getLogger('whoosh_indexer')


 class WhooshIndexingDaemon(object):
     """
     Daemon for atomic indexing jobs
     """

     def __init__(self, indexname=IDX_NAME, index_location=None,
                  repo_location=None, sa=None, repo_list=None,
                  repo_update_list=None):
         self.indexname = indexname

         self.index_location = index_location
         if not index_location:
             raise Exception('You have to provide index location')

         self.repo_location = repo_location
         if not repo_location:
             raise Exception('You have to provide repositories location')

         self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

         #filter repo list
         if repo_list:
             #Fix non-ascii repo names to unicode
             repo_list = map(safe_unicode, repo_list)
             self.filtered_repo_paths = {}
             for repo_name, repo in self.repo_paths.items():
                 if repo_name in repo_list:
                     self.filtered_repo_paths[repo_name] = repo

             self.repo_paths = self.filtered_repo_paths

         #filter update repo list
         self.filtered_repo_update_paths = {}
         if repo_update_list:
             self.filtered_repo_update_paths = {}
             for repo_name, repo in self.repo_paths.items():
                 if repo_name in repo_update_list:
                     self.filtered_repo_update_paths[repo_name] = repo
             self.repo_paths = self.filtered_repo_update_paths

         self.initial = True
         if not os.path.isdir(self.index_location):
             os.makedirs(self.index_location)
-            log.info('Cannot run incremental index since it does not'
-                     ' yet exist running full build')
+            log.info('Cannot run incremental index since it does not '
+                     'yet exist running full build')
         elif not exists_in(self.index_location, IDX_NAME):
-            log.info('Running full index build as the file content'
-                     ' index does not exist')
+            log.info('Running full index build as the file content '
+                     'index does not exist')
         elif not exists_in(self.index_location, CHGSET_IDX_NAME):
-            log.info('Running full index build as the changeset'
-                     ' index does not exist')
+            log.info('Running full index build as the changeset '
+                     'index does not exist')
         else:
             self.initial = False

+    def _get_index_revision(self, repo):
+        db_repo = Repository.get_by_repo_name(repo.name)
+        landing_rev = 'tip'
+        if db_repo:
+            landing_rev = db_repo.landing_rev
+        return landing_rev
+
+    def _get_index_changeset(self, repo):
+        index_rev = self._get_index_revision(repo)
+        cs = repo.get_changeset(index_rev)
+        return cs
+
     def get_paths(self, repo):
         """
         recursive walk in root dir and return a set of all path in that dir
         based on repository walk function
         """
         index_paths_ = set()
         try:
-            tip = repo.get_changeset('tip')
-            for _topnode, _dirs, files in tip.walk('/'):
+            cs = self._get_index_changeset(repo)
+            for _topnode, _dirs, files in cs.walk('/'):
                 for f in files:
                     index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))

         except RepositoryError:
             log.debug(traceback.format_exc())
             pass
         return index_paths_

     def get_node(self, repo, path):
         n_path = path[len(repo.path) + 1:]
-        node = repo.get_changeset().get_node(n_path)
+        cs = self._get_index_changeset(repo)
+        node = cs.get_node(n_path)
         return node

     def get_node_mtime(self, node):
         return mktime(node.last_changeset.date.timetuple())

     def add_doc(self, writer, path, repo, repo_name):
         """
         Adding doc to writer this function itself fetches data from
         the instance of vcs backend
         """

         node = self.get_node(repo, path)
         indexed = indexed_w_content = 0
         # we just index the content of chosen files, and skip binary files
         if node.extension in INDEX_EXTENSIONS and not node.is_binary:
             u_content = node.content
             if not isinstance(u_content, unicode):
                 log.warning(' >> %s Could not get this content as unicode '
                             'replacing with empty content' % path)
                 u_content = u''
             else:
                 log.debug(' >> %s [WITH CONTENT]' % path)
                 indexed_w_content += 1

         else:
             log.debug(' >> %s' % path)
             # just index file name without it's content
             u_content = u''
             indexed += 1

         p = safe_unicode(path)
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
             modtime=self.get_node_mtime(node),
             extension=node.extension
         )
         return indexed, indexed_w_content

     def index_changesets(self, writer, repo_name, repo, start_rev=None):
         """
         Add all changeset in the vcs repo starting at start_rev
         to the index writer

         :param writer: the whoosh index writer to add to
         :param repo_name: name of the repository from whence the
           changeset originates including the repository group
         :param repo: the vcs repository instance to index changesets for,
           the presumption is the repo has changesets to index
         :param start_rev=None: the full sha id to start indexing from
           if start_rev is None then index from the first changeset in
           the repo
         """

         if start_rev is None:
             start_rev = repo[0].raw_id

         log.debug('indexing changesets in %s starting at rev: %s' %
                   (repo_name, start_rev))

         indexed = 0
         for cs in repo.get_changesets(start=start_rev):
             log.debug(' >> %s' % cs)
             writer.add_document(
                 raw_id=unicode(cs.raw_id),
                 owner=unicode(repo.contact),
                 date=cs._timestamp,
                 repository=safe_unicode(repo_name),
                 author=cs.author,
                 message=cs.message,
                 last=cs.last,
                 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
                 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
                 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
                 parents=u' '.join([cs.raw_id for cs in cs.parents]),
             )
             indexed += 1

         log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
         return indexed

     def index_files(self, file_idx_writer, repo_name, repo):
         """
         Index files for given repo_name

         :param file_idx_writer: the whoosh index writer to add to
         :param repo_name: name of the repository we're indexing
         :param repo: instance of vcs repo
         """
         i_cnt = iwc_cnt = 0
-        log.debug('building index for [%s]' % repo.path)
+        log.debug('building index for %s @revision:%s' % (repo.path,
+                                                self._get_index_revision(repo)))
         for idx_path in self.get_paths(repo):
             i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
             i_cnt += i
             iwc_cnt += iwc

         log.debug('added %s files %s with content for repo %s' %
                   (i_cnt + iwc_cnt, iwc_cnt, repo.path))
         return i_cnt, iwc_cnt

     def update_changeset_index(self):
         idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

         with idx.searcher() as searcher:
             writer = idx.writer()
             writer_is_dirty = False
             try:
                 indexed_total = 0
                 repo_name = None
                 for repo_name, repo in self.repo_paths.items():
                     # skip indexing if there aren't any revs in the repo
                     num_of_revs = len(repo)
                     if num_of_revs < 1:
                         continue

                     qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                     q = qp.parse(u"last:t AND %s" % repo_name)

                     results = searcher.search(q)

                     # default to scanning the entire repo
                     last_rev = 0
                     start_id = None

                     if len(results) > 0:
                         # assuming that there is only one result, if not this
                         # may require a full re-index.
                         start_id = results[0]['raw_id']
                         last_rev = repo.get_changeset(revision=start_id).revision

                     # there are new changesets to index or a new repo to index
                     if last_rev == 0 or num_of_revs > last_rev + 1:
                         # delete the docs in the index for the previous
                         # last changeset(s)
                         for hit in results:
                             q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                          (repo_name, hit['raw_id']))
                             writer.delete_by_query(q)

                         # index from the previous last changeset + all new ones
                         indexed_total += self.index_changesets(writer,
                                                 repo_name, repo, start_id)
                         writer_is_dirty = True
                 log.debug('indexed %s changesets for repo %s' % (
                           indexed_total, repo_name)
                 )
             finally:
                 if writer_is_dirty:
                     log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                     writer.commit(merge=True)
                     log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
                 else:
-                    writer.cancel
                     log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')

     def update_file_index(self):
         log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                    'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

         idx = open_dir(self.index_location, indexname=self.indexname)
         # The set of all paths in the index
         indexed_paths = set()
         # The set of all paths we need to re-index
         to_index = set()

         writer = idx.writer()
         writer_is_dirty = False
         try:
             with idx.reader() as reader:

                 # Loop over the stored fields in the index
                 for fields in reader.all_stored_fields():
                     indexed_path = fields['path']
                     indexed_repo_path = fields['repository']
                     indexed_paths.add(indexed_path)

                     if not indexed_repo_path in self.filtered_repo_update_paths:
                         continue

                     repo = self.repo_paths[indexed_repo_path]

                     try:
                         node = self.get_node(repo, indexed_path)
                         # Check if this file was changed since it was indexed
                         indexed_time = fields['modtime']
                         mtime = self.get_node_mtime(node)
                         if mtime > indexed_time:
                             # The file has changed, delete it and add it to
                             # the list of files to reindex
                             log.debug(
                                 'adding to reindex list %s mtime: %s vs %s' % (
                                     indexed_path, mtime, indexed_time)
                             )
                             writer.delete_by_term('fileid', indexed_path)
                             writer_is_dirty = True

                             to_index.add(indexed_path)
                     except (ChangesetError, NodeDoesNotExistError):
                         # This file was deleted since it was indexed
                         log.debug('removing from index %s' % indexed_path)
                         writer.delete_by_term('path', indexed_path)
                         writer_is_dirty = True

             # Loop over the files in the filesystem
             # Assume we have a function that gathers the filenames of the
             # documents to be indexed
             ri_cnt_total = 0  # indexed
             riwc_cnt_total = 0  # indexed with content
             for repo_name, repo in self.repo_paths.items():
                 # skip indexing if there aren't any revisions
                 if len(repo) < 1:
                     continue
                 ri_cnt = 0  # indexed
                 riwc_cnt = 0  # indexed with content
                 for path in self.get_paths(repo):
                     path = safe_unicode(path)
                     if path in to_index or path not in indexed_paths:

                         # This is either a file that's changed, or a new file
                         # that wasn't indexed before. So index it!
                         i, iwc = self.add_doc(writer, path, repo, repo_name)
                         writer_is_dirty = True
                         log.debug('re indexing %s' % path)
                         ri_cnt += i
                         ri_cnt_total += 1
                         riwc_cnt += iwc
                         riwc_cnt_total += iwc
                 log.debug('added %s files %s with content for repo %s' % (
                           ri_cnt + riwc_cnt, riwc_cnt, repo.path)
                 )
             log.debug('indexed %s files in total and %s with content' % (
                       ri_cnt_total, riwc_cnt_total)
             )
         finally:
             if writer_is_dirty:
                 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
                 writer.commit(merge=True)
                 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
             else:
                 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
                 writer.cancel()

     def build_indexes(self):
         if os.path.exists(self.index_location):
             log.debug('removing previous index')
             rmtree(self.index_location)

         if not os.path.exists(self.index_location):
             os.mkdir(self.index_location)

         chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
                                indexname=CHGSET_IDX_NAME)
         chgset_idx_writer = chgset_idx.writer()

         file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
         file_idx_writer = file_idx.writer()
         log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                   'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

         for repo_name, repo in self.repo_paths.items():
             # skip indexing if there aren't any revisions
             if len(repo) < 1:
                 continue

             self.index_files(file_idx_writer, repo_name, repo)
             self.index_changesets(chgset_idx_writer, repo_name, repo)

         log.debug('>> COMMITING CHANGES <<')
         file_idx_writer.commit(merge=True)
         chgset_idx_writer.commit(merge=True)
         log.debug('>>> FINISHED BUILDING INDEX <<<')

     def update_indexes(self):
         self.update_file_index()
         self.update_changeset_index()

     def run(self, full_index=False):
         """Run daemon"""
         if full_index or self.initial:
             self.build_indexes()
         else:
             self.update_indexes()
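The substance of this change: the file index is now built against each repository's configured landing revision (Repository.landing_rev, falling back to 'tip' when the repo has no database record) instead of a hard-coded 'tip'. A minimal usage sketch of the daemon after this change; the two paths are hypothetical examples, while WhooshIndexingDaemon, its constructor arguments, and run() come from the module above:

    # Run the Whoosh indexing daemon against a RhodeCode installation.
    # Paths are hypothetical; the daemon now walks each repo at the
    # revision resolved by _get_index_revision(), not at 'tip'.
    from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

    daemon = WhooshIndexingDaemon(
        index_location='/var/lib/rhodecode/index',  # hypothetical path
        repo_location='/var/lib/rhodecode/repos',   # hypothetical path
    )
    daemon.run(full_index=False)  # incremental; True forces a full rebuild

And a sketch of reading the file-content index the daemon builds, assuming the same hypothetical index location; open_dir, QueryParser, and searcher.search are standard Whoosh calls, and IDX_NAME/SCHEMA are the same objects this module imports from rhodecode.lib.indexers:

    # Query the file index built above (Python 2, matching the codebase).
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    from rhodecode.lib.indexers import IDX_NAME, SCHEMA

    idx = open_dir('/var/lib/rhodecode/index', indexname=IDX_NAME)
    with idx.searcher() as searcher:
        # 'content', 'repository', and 'path' are fields defined in SCHEMA
        q = QueryParser('content', schema=SCHEMA).parse(u'landing_rev')
        for hit in searcher.search(q, limit=10):
            print hit['repository'], hit['path']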