##// END OF EJS Templates
#453 added ID field in whoosh SCHEMA that solves the issue of reindexing modified files
marcink -
r2388:a0ef98f2 beta
parent child Browse files
Show More
@@ -1,240 +1,241 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2 """
2 """
3 rhodecode.lib.indexers.__init__
3 rhodecode.lib.indexers.__init__
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5
5
6 Whoosh indexing module for RhodeCode
6 Whoosh indexing module for RhodeCode
7
7
8 :created_on: Aug 17, 2010
8 :created_on: Aug 17, 2010
9 :author: marcink
9 :author: marcink
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details.
11 :license: GPLv3, see COPYING for more details.
12 """
12 """
13 # This program is free software: you can redistribute it and/or modify
13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or
15 # the Free Software Foundation, either version 3 of the License, or
16 # (at your option) any later version.
16 # (at your option) any later version.
17 #
17 #
18 # This program is distributed in the hope that it will be useful,
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
21 # GNU General Public License for more details.
22 #
22 #
23 # You should have received a copy of the GNU General Public License
23 # You should have received a copy of the GNU General Public License
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 import os
25 import os
26 import sys
26 import sys
27 import traceback
27 import traceback
28 import logging
28 import logging
29 from os.path import dirname as dn, join as jn
29 from os.path import dirname as dn, join as jn
30
30
31 #to get the rhodecode import
31 #to get the rhodecode import
32 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
32 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
33
33
34 from string import strip
34 from string import strip
35 from shutil import rmtree
35 from shutil import rmtree
36
36
37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
38 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
38 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
39 from whoosh.index import create_in, open_dir
39 from whoosh.index import create_in, open_dir
40 from whoosh.formats import Characters
40 from whoosh.formats import Characters
41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
42
42
43 from webhelpers.html.builder import escape
43 from webhelpers.html.builder import escape
44 from sqlalchemy import engine_from_config
44 from sqlalchemy import engine_from_config
45
45
46 from rhodecode.model import init_model
46 from rhodecode.model import init_model
47 from rhodecode.model.scm import ScmModel
47 from rhodecode.model.scm import ScmModel
48 from rhodecode.model.repo import RepoModel
48 from rhodecode.model.repo import RepoModel
49 from rhodecode.config.environment import load_environment
49 from rhodecode.config.environment import load_environment
50 from rhodecode.lib.utils2 import LazyProperty
50 from rhodecode.lib.utils2 import LazyProperty
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
52 load_rcextensions
52 load_rcextensions
53
53
# CUSTOM ANALYZER: split on word characters, then lowercase every token
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


#INDEX SCHEMA DEFINITION
SCHEMA = Schema(
    # unique per-document id (full file path); used to delete/replace docs
    fileid=ID(unique=True),
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=TEXT(stored=True),
    # file content: positional Characters format so match offsets are kept
    # for highlighting; stored so short fragments can be rebuilt from it
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    # last-change timestamp, stored only (used for incremental reindexing)
    modtime=STORED(),
    extension=TEXT(stored=True)
)

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)
72
73
73
74
class MakeIndex(BasePasterCommand):
    """
    Paster command that builds (or incrementally updates) the Whoosh
    full text search index for the configured repositories.
    """

    max_args = 1
    min_args = 1

    usage = "CONFIG_FILE"
    summary = "Creates index for full text search given configuration file"
    group_name = "RhodeCode"
    takes_config_file = -1
    parser = Command.standard_parser(verbose=True)

    def command(self):
        """Bootstrap the application from the ini file, then run the
        indexing daemon under an exclusive lock file."""
        logging.config.fileConfig(self.path_to_ini_file)
        from pylons import config
        add_cache(config)
        engine = engine_from_config(config, 'sqlalchemy.db1.')
        init_model(engine)
        index_location = config['index_dir']

        # repositories root: explicit CLI option wins over the DB setting
        if self.options.repo_location:
            repo_location = self.options.repo_location
        else:
            repo_location = RepoModel().repos_path

        # optional comma separated repo filters from the CLI
        repo_list = None
        if self.options.repo_list:
            repo_list = [strip(part)
                         for part in self.options.repo_list.split(',')]

        repo_update_list = None
        if self.options.repo_update_list:
            repo_update_list = [strip(part)
                                for part in
                                self.options.repo_update_list.split(',')]

        load_rcextensions(config['here'])
        #======================================================================
        # WHOOSH DAEMON
        #======================================================================
        from rhodecode.lib.pidlock import LockHeld, DaemonLock
        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
        try:
            # only one index build may run at a time; exit if lock is held
            lock = DaemonLock(file_=jn(dn(dn(index_location)),
                                       'make_index.lock'))
            WhooshIndexingDaemon(index_location=index_location,
                                 repo_location=repo_location,
                                 repo_list=repo_list,
                                 repo_update_list=repo_update_list)\
                .run(full_index=self.options.full_index)
            lock.release()
        except LockHeld:
            sys.exit(1)

    def update_parser(self):
        """Register the command line options of this paster command."""
        self.parser.add_option('--repo-location',
                          action='store',
                          dest='repo_location',
                          help="Specifies repositories location to index OPTIONAL",
                          )
        self.parser.add_option('--index-only',
                          action='store',
                          dest='repo_list',
                          help="Specifies a comma separated list of repositores "
                                "to build index on. If not given all repositories "
                                "are scanned for indexing. OPTIONAL",
                          )
        self.parser.add_option('--update-only',
                          action='store',
                          dest='repo_update_list',
                          help="Specifies a comma separated list of repositores "
                                "to re-build index on. OPTIONAL",
                          )
        self.parser.add_option('-f',
                          action='store_true',
                          dest='full_index',
                          help="Specifies that index should be made full i.e"
                                " destroy old and build from scratch",
                          default=False)
140
141
141
142
class WhooshResultWrapper(object):
    """
    Lazy wrapper around whoosh matcher results: collects document ids with
    their highlight chunk offsets once, then serves full result dicts on
    iteration or slicing.
    """
    def __init__(self, search_type, searcher, matcher, highlight_items,
                 repo_location):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        self.fragment_size = 200
        self.repo_location = repo_location

    @LazyProperty
    def doc_ids(self):
        """Walk the matcher once and collect [docnum, chunk-offsets] pairs
        for every active match; cached after the first access."""
        collected = []
        while self.matcher.is_active():
            collected.append([self.matcher.id(), list(self.get_chunks())])
            self.matcher.next()
        return collected

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows Iteration over results,and lazy generate content

        *Requires* implementation of ``__getitem__`` method.
        """
        for doc in self.doc_ids:
            yield self.get_full_content(doc)

    def __getitem__(self, key):
        """
        Slicing of resultWrapper
        """
        return [self.get_full_content(doc)
                for doc in self.doc_ids[key.start:key.stop]]

    def get_full_content(self, docid):
        """Build the full result dict (stored fields plus short highlighted
        content and the repo-relative file path) for one matched document."""
        stored = self.searcher.stored_fields(docid[0])
        full_repo_path = jn(self.repo_location, stored['repository'])
        # strip the absolute repo prefix so only the in-repo path remains
        f_path = stored['path'].split(full_repo_path)[-1].lstrip(os.sep)

        short = self.get_short_content(stored, docid[1])
        stored.update({'content_short': short,
                       'content_short_hl': self.highlight(short),
                       'f_path': f_path})

        return stored

    def get_short_content(self, res, chunks):
        # stitch together the selected slices of the stored file content
        return ''.join(res['content'][start:end] for start, end in chunks)

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but not overlap chunks so it doesn't highlight the same
        close occurrences twice.
        """
        previous_end = 0
        for span in self.matcher.spans():
            # pad each match by fragment_size on both sides, clamped at 0
            begin = max(0, (span.startchar or 0) - self.fragment_size)
            finish = (span.endchar or 0) + self.fragment_size
            # never start before the previous chunk ended (no overlap)
            if begin < previous_end:
                begin = previous_end
            previous_end = finish
            yield (begin, finish,)

    def highlight(self, content, top=5):
        """Return HTML-highlighted fragments of *content*; only meaningful
        for content searches, otherwise an empty string."""
        if self.search_type != 'content':
            return ''
        return highlight(
            text=escape(content),
            terms=self.highlight_items,
            analyzer=ANALYZER,
            fragmenter=FRAGMENTER,
            formatter=FORMATTER,
            top=top
        )
@@ -1,251 +1,257 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2 """
2 """
3 rhodecode.lib.indexers.daemon
3 rhodecode.lib.indexers.daemon
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5
5
6 A daemon will read from task table and run tasks
6 A daemon will read from task table and run tasks
7
7
8 :created_on: Jan 26, 2010
8 :created_on: Jan 26, 2010
9 :author: marcink
9 :author: marcink
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details.
11 :license: GPLv3, see COPYING for more details.
12 """
12 """
13 # This program is free software: you can redistribute it and/or modify
13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or
15 # the Free Software Foundation, either version 3 of the License, or
16 # (at your option) any later version.
16 # (at your option) any later version.
17 #
17 #
18 # This program is distributed in the hope that it will be useful,
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
21 # GNU General Public License for more details.
22 #
22 #
23 # You should have received a copy of the GNU General Public License
23 # You should have received a copy of the GNU General Public License
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25
25
26 import os
26 import os
27 import sys
27 import sys
28 import logging
28 import logging
29 import traceback
29 import traceback
30
30
31 from shutil import rmtree
31 from shutil import rmtree
32 from time import mktime
32 from time import mktime
33
33
34 from os.path import dirname as dn
34 from os.path import dirname as dn
35 from os.path import join as jn
35 from os.path import join as jn
36
36
37 #to get the rhodecode import
37 #to get the rhodecode import
38 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
38 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
39 sys.path.append(project_path)
39 sys.path.append(project_path)
40
40
41 from rhodecode.config.conf import INDEX_EXTENSIONS
41 from rhodecode.config.conf import INDEX_EXTENSIONS
42 from rhodecode.model.scm import ScmModel
42 from rhodecode.model.scm import ScmModel
43 from rhodecode.lib.utils2 import safe_unicode
43 from rhodecode.lib.utils2 import safe_unicode
44 from rhodecode.lib.indexers import SCHEMA, IDX_NAME
44 from rhodecode.lib.indexers import SCHEMA, IDX_NAME
45
45
46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
47 NodeDoesNotExistError
47 NodeDoesNotExistError
48
48
49 from whoosh.index import create_in, open_dir
49 from whoosh.index import create_in, open_dir
50
50
# dedicated logger name so indexer output can be filtered independently
log = logging.getLogger('whoosh_indexer')
52
52
53
53
54 class WhooshIndexingDaemon(object):
54 class WhooshIndexingDaemon(object):
55 """
55 """
56 Daemon for atomic indexing jobs
56 Daemon for atomic indexing jobs
57 """
57 """
58
58
59 def __init__(self, indexname=IDX_NAME, index_location=None,
59 def __init__(self, indexname=IDX_NAME, index_location=None,
60 repo_location=None, sa=None, repo_list=None,
60 repo_location=None, sa=None, repo_list=None,
61 repo_update_list=None):
61 repo_update_list=None):
62 self.indexname = indexname
62 self.indexname = indexname
63
63
64 self.index_location = index_location
64 self.index_location = index_location
65 if not index_location:
65 if not index_location:
66 raise Exception('You have to provide index location')
66 raise Exception('You have to provide index location')
67
67
68 self.repo_location = repo_location
68 self.repo_location = repo_location
69 if not repo_location:
69 if not repo_location:
70 raise Exception('You have to provide repositories location')
70 raise Exception('You have to provide repositories location')
71
71
72 self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
72 self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
73
73
74 #filter repo list
74 #filter repo list
75 if repo_list:
75 if repo_list:
76 self.filtered_repo_paths = {}
76 self.filtered_repo_paths = {}
77 for repo_name, repo in self.repo_paths.items():
77 for repo_name, repo in self.repo_paths.items():
78 if repo_name in repo_list:
78 if repo_name in repo_list:
79 self.filtered_repo_paths[repo_name] = repo
79 self.filtered_repo_paths[repo_name] = repo
80
80
81 self.repo_paths = self.filtered_repo_paths
81 self.repo_paths = self.filtered_repo_paths
82
82
83 #filter update repo list
83 #filter update repo list
84 self.filtered_repo_update_paths = {}
84 self.filtered_repo_update_paths = {}
85 if repo_update_list:
85 if repo_update_list:
86 self.filtered_repo_update_paths = {}
86 self.filtered_repo_update_paths = {}
87 for repo_name, repo in self.repo_paths.items():
87 for repo_name, repo in self.repo_paths.items():
88 if repo_name in repo_update_list:
88 if repo_name in repo_update_list:
89 self.filtered_repo_update_paths[repo_name] = repo
89 self.filtered_repo_update_paths[repo_name] = repo
90 self.repo_paths = self.filtered_repo_update_paths
90 self.repo_paths = self.filtered_repo_update_paths
91
91
92 self.initial = False
92 self.initial = False
93 if not os.path.isdir(self.index_location):
93 if not os.path.isdir(self.index_location):
94 os.makedirs(self.index_location)
94 os.makedirs(self.index_location)
95 log.info('Cannot run incremental index since it does not'
95 log.info('Cannot run incremental index since it does not'
96 ' yet exist running full build')
96 ' yet exist running full build')
97 self.initial = True
97 self.initial = True
98
98
99 def get_paths(self, repo):
99 def get_paths(self, repo):
100 """
100 """
101 recursive walk in root dir and return a set of all path in that dir
101 recursive walk in root dir and return a set of all path in that dir
102 based on repository walk function
102 based on repository walk function
103 """
103 """
104 index_paths_ = set()
104 index_paths_ = set()
105 try:
105 try:
106 tip = repo.get_changeset('tip')
106 tip = repo.get_changeset('tip')
107 for topnode, dirs, files in tip.walk('/'):
107 for topnode, dirs, files in tip.walk('/'):
108 for f in files:
108 for f in files:
109 index_paths_.add(jn(repo.path, f.path))
109 index_paths_.add(jn(repo.path, f.path))
110
110
111 except RepositoryError, e:
111 except RepositoryError, e:
112 log.debug(traceback.format_exc())
112 log.debug(traceback.format_exc())
113 pass
113 pass
114 return index_paths_
114 return index_paths_
115
115
116 def get_node(self, repo, path):
116 def get_node(self, repo, path):
117 n_path = path[len(repo.path) + 1:]
117 n_path = path[len(repo.path) + 1:]
118 node = repo.get_changeset().get_node(n_path)
118 node = repo.get_changeset().get_node(n_path)
119 return node
119 return node
120
120
121 def get_node_mtime(self, node):
121 def get_node_mtime(self, node):
122 return mktime(node.last_changeset.date.timetuple())
122 return mktime(node.last_changeset.date.timetuple())
123
123
124 def add_doc(self, writer, path, repo, repo_name):
124 def add_doc(self, writer, path, repo, repo_name):
125 """
125 """
126 Adding doc to writer this function itself fetches data from
126 Adding doc to writer this function itself fetches data from
127 the instance of vcs backend
127 the instance of vcs backend
128 """
128 """
129
129
130 node = self.get_node(repo, path)
130 node = self.get_node(repo, path)
131 indexed = indexed_w_content = 0
131 indexed = indexed_w_content = 0
132 # we just index the content of chosen files, and skip binary files
132 # we just index the content of chosen files, and skip binary files
133 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
133 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
134 u_content = node.content
134 u_content = node.content
135 if not isinstance(u_content, unicode):
135 if not isinstance(u_content, unicode):
136 log.warning(' >> %s Could not get this content as unicode '
136 log.warning(' >> %s Could not get this content as unicode '
137 'replacing with empty content' % path)
137 'replacing with empty content' % path)
138 u_content = u''
138 u_content = u''
139 else:
139 else:
140 log.debug(' >> %s [WITH CONTENT]' % path)
140 log.debug(' >> %s [WITH CONTENT]' % path)
141 indexed_w_content += 1
141 indexed_w_content += 1
142
142
143 else:
143 else:
144 log.debug(' >> %s' % path)
144 log.debug(' >> %s' % path)
145 # just index file name without it's content
145 # just index file name without it's content
146 u_content = u''
146 u_content = u''
147 indexed += 1
147 indexed += 1
148
148
149 p = safe_unicode(path)
149 writer.add_document(
150 writer.add_document(
151 fileid=p,
150 owner=unicode(repo.contact),
152 owner=unicode(repo.contact),
151 repository=safe_unicode(repo_name),
153 repository=safe_unicode(repo_name),
152 path=safe_unicode(path),
154 path=p,
153 content=u_content,
155 content=u_content,
154 modtime=self.get_node_mtime(node),
156 modtime=self.get_node_mtime(node),
155 extension=node.extension
157 extension=node.extension
156 )
158 )
157 return indexed, indexed_w_content
159 return indexed, indexed_w_content
158
160
159 def build_index(self):
161 def build_index(self):
160 if os.path.exists(self.index_location):
162 if os.path.exists(self.index_location):
161 log.debug('removing previous index')
163 log.debug('removing previous index')
162 rmtree(self.index_location)
164 rmtree(self.index_location)
163
165
164 if not os.path.exists(self.index_location):
166 if not os.path.exists(self.index_location):
165 os.mkdir(self.index_location)
167 os.mkdir(self.index_location)
166
168
167 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
169 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
168 writer = idx.writer()
170 writer = idx.writer()
169 log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS)
171 log.debug('BUILDIN INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS)
170 for repo_name, repo in self.repo_paths.items():
172 for repo_name, repo in self.repo_paths.items():
171 log.debug('building index @ %s' % repo.path)
173 log.debug('building index @ %s' % repo.path)
172 i_cnt = iwc_cnt = 0
174 i_cnt = iwc_cnt = 0
173 for idx_path in self.get_paths(repo):
175 for idx_path in self.get_paths(repo):
174 i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
176 i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
175 i_cnt += i
177 i_cnt += i
176 iwc_cnt += iwc
178 iwc_cnt += iwc
177 log.debug('added %s files %s with content for repo %s' % (
179 log.debug('added %s files %s with content for repo %s' % (
178 i_cnt + iwc_cnt, iwc_cnt, repo.path)
180 i_cnt + iwc_cnt, iwc_cnt, repo.path)
179 )
181 )
180
182
181 log.debug('>> COMMITING CHANGES <<')
183 log.debug('>> COMMITING CHANGES <<')
182 writer.commit(merge=True)
184 writer.commit(merge=True)
183 log.debug('>>> FINISHED BUILDING INDEX <<<')
185 log.debug('>>> FINISHED BUILDING INDEX <<<')
184
186
185 def update_index(self):
187 def update_index(self):
186 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
188 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
187 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
189 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
188
190
189 idx = open_dir(self.index_location, indexname=self.indexname)
191 idx = open_dir(self.index_location, indexname=self.indexname)
190 # The set of all paths in the index
192 # The set of all paths in the index
191 indexed_paths = set()
193 indexed_paths = set()
192 # The set of all paths we need to re-index
194 # The set of all paths we need to re-index
193 to_index = set()
195 to_index = set()
194
196
195 reader = idx.reader()
197 reader = idx.reader()
196 writer = idx.writer()
198 writer = idx.writer()
197
199
198 # Loop over the stored fields in the index
200 # Loop over the stored fields in the index
199 for fields in reader.all_stored_fields():
201 for fields in reader.all_stored_fields():
200 indexed_path = fields['path']
202 indexed_path = fields['path']
201 indexed_repo_path = fields['repository']
203 indexed_repo_path = fields['repository']
202 indexed_paths.add(indexed_path)
204 indexed_paths.add(indexed_path)
203
205
204 if not indexed_repo_path in self.filtered_repo_update_paths:
206 if not indexed_repo_path in self.filtered_repo_update_paths:
205 continue
207 continue
206
208
207 repo = self.repo_paths[indexed_repo_path]
209 repo = self.repo_paths[indexed_repo_path]
208
210
209 try:
211 try:
210 node = self.get_node(repo, indexed_path)
212 node = self.get_node(repo, indexed_path)
211 # Check if this file was changed since it was indexed
213 # Check if this file was changed since it was indexed
212 indexed_time = fields['modtime']
214 indexed_time = fields['modtime']
213 mtime = self.get_node_mtime(node)
215 mtime = self.get_node_mtime(node)
214 if mtime > indexed_time:
216 if mtime > indexed_time:
215 # The file has changed, delete it and add it to the list of
217 # The file has changed, delete it and add it to the list of
216 # files to reindex
218 # files to reindex
217 log.debug('adding to reindex list %s' % indexed_path)
219 log.debug('adding to reindex list %s mtime: %s vs %s' % (
218 writer.delete_by_term('path', indexed_path)
220 indexed_path, mtime, indexed_time)
221 )
222 writer.delete_by_term('fileid', indexed_path)
223
219 to_index.add(indexed_path)
224 to_index.add(indexed_path)
220 except (ChangesetError, NodeDoesNotExistError):
225 except (ChangesetError, NodeDoesNotExistError):
221 # This file was deleted since it was indexed
226 # This file was deleted since it was indexed
222 log.debug('removing from index %s' % indexed_path)
227 log.debug('removing from index %s' % indexed_path)
223 writer.delete_by_term('path', indexed_path)
228 writer.delete_by_term('path', indexed_path)
224
229
225 # Loop over the files in the filesystem
230 # Loop over the files in the filesystem
226 # Assume we have a function that gathers the filenames of the
231 # Assume we have a function that gathers the filenames of the
227 # documents to be indexed
232 # documents to be indexed
228 ri_cnt = riwc_cnt = 0
233 ri_cnt = riwc_cnt = 0
229 for repo_name, repo in self.repo_paths.items():
234 for repo_name, repo in self.repo_paths.items():
230 for path in self.get_paths(repo):
235 for path in self.get_paths(repo):
231 path = safe_unicode(path)
236 path = safe_unicode(path)
232 if path in to_index or path not in indexed_paths:
237 if path in to_index or path not in indexed_paths:
238
233 # This is either a file that's changed, or a new file
239 # This is either a file that's changed, or a new file
234 # that wasn't indexed before. So index it!
240 # that wasn't indexed before. So index it!
235 i, iwc = self.add_doc(writer, path, repo, repo_name)
241 i, iwc = self.add_doc(writer, path, repo, repo_name)
236 log.debug('re indexing %s' % path)
242 log.debug('re indexing %s' % path)
237 ri_cnt += i
243 ri_cnt += i
238 riwc_cnt += iwc
244 riwc_cnt += iwc
239 log.debug('added %s files %s with content for repo %s' % (
245 log.debug('added %s files %s with content for repo %s' % (
240 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
246 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
241 )
247 )
242 log.debug('>> COMMITING CHANGES <<')
248 log.debug('>> COMMITING CHANGES <<')
243 writer.commit(merge=True)
249 writer.commit(merge=True)
244 log.debug('>>> FINISHED REBUILDING INDEX <<<')
250 log.debug('>>> FINISHED REBUILDING INDEX <<<')
245
251
246 def run(self, full_index=False):
252 def run(self, full_index=False):
247 """Run daemon"""
253 """Run daemon"""
248 if full_index or self.initial:
254 if full_index or self.initial:
249 self.build_index()
255 self.build_index()
250 else:
256 else:
251 self.update_index()
257 self.update_index()
General Comments 0
You need to be logged in to leave comments. Login now