#453: added an ID field to the whoosh SCHEMA, which fixes re-indexing of modified files
marcink
r2388:a0ef98f2 beta
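
The change below adds a unique ``fileid`` field to the Whoosh SCHEMA and keys document deletion on it during incremental updates, so a modified file reliably replaces its stale index entry instead of accumulating duplicates. A minimal sketch of why a unique ID field matters in Whoosh (the index directory and document values here are illustrative, not RhodeCode's):

    import os
    from whoosh.fields import Schema, ID, TEXT
    from whoosh.index import create_in

    # a schema with a unique ID field, mirroring the fileid added below
    schema = Schema(fileid=ID(unique=True), content=TEXT(stored=True))
    if not os.path.exists('idx'):
        os.mkdir('idx')
    ix = create_in('idx', schema)

    writer = ix.writer()
    writer.add_document(fileid=u'repo/setup.py', content=u'version one')
    writer.commit()

    # update_document() uses the unique field to delete the old copy first;
    # without a unique field the index would keep both versions
    writer = ix.writer()
    writer.update_document(fileid=u'repo/setup.py', content=u'version two')
    writer.commit()
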
rhodecode/lib/indexers/__init__.py
@@ -1,240 +1,241 @@
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 rhodecode.lib.indexers.__init__
4 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 5
6 6 Whoosh indexing module for RhodeCode
7 7
8 8 :created_on: Aug 17, 2010
9 9 :author: marcink
10 10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 11 :license: GPLv3, see COPYING for more details.
12 12 """
13 13 # This program is free software: you can redistribute it and/or modify
14 14 # it under the terms of the GNU General Public License as published by
15 15 # the Free Software Foundation, either version 3 of the License, or
16 16 # (at your option) any later version.
17 17 #
18 18 # This program is distributed in the hope that it will be useful,
19 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 21 # GNU General Public License for more details.
22 22 #
23 23 # You should have received a copy of the GNU General Public License
24 24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25 import os
26 26 import sys
27 27 import traceback
28 28 import logging
29 29 from os.path import dirname as dn, join as jn
30 30
31 31 #to get the rhodecode import
32 32 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
33 33
34 34 from string import strip
35 35 from shutil import rmtree
36 36
37 37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
38 38 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
39 39 from whoosh.index import create_in, open_dir
40 40 from whoosh.formats import Characters
41 41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
42 42
43 43 from webhelpers.html.builder import escape
44 44 from sqlalchemy import engine_from_config
45 45
46 46 from rhodecode.model import init_model
47 47 from rhodecode.model.scm import ScmModel
48 48 from rhodecode.model.repo import RepoModel
49 49 from rhodecode.config.environment import load_environment
50 50 from rhodecode.lib.utils2 import LazyProperty
51 51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
52 52 load_rcextensions
53 53
54 54 # CUSTOM ANALYZER wordsplit + lowercase filter
55 55 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
56 56
57 57
58 58 #INDEX SCHEMA DEFINITION
59 59 SCHEMA = Schema(
60 fileid=ID(unique=True),
60 61 owner=TEXT(),
61 62 repository=TEXT(stored=True),
62 63 path=TEXT(stored=True),
63 64 content=FieldType(format=Characters(), analyzer=ANALYZER,
64 65 scorable=True, stored=True),
65 66 modtime=STORED(),
66 67 extension=TEXT(stored=True)
67 68 )
68 69
69 70 IDX_NAME = 'HG_INDEX'
70 71 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
71 72 FRAGMENTER = ContextFragmenter(200)
72 73
73 74
74 75 class MakeIndex(BasePasterCommand):
75 76
76 77 max_args = 1
77 78 min_args = 1
78 79
79 80 usage = "CONFIG_FILE"
80 81 summary = "Creates index for full-text search given a configuration file"
81 82 group_name = "RhodeCode"
82 83 takes_config_file = -1
83 84 parser = Command.standard_parser(verbose=True)
84 85
85 86 def command(self):
86 87 logging.config.fileConfig(self.path_to_ini_file)
87 88 from pylons import config
88 89 add_cache(config)
89 90 engine = engine_from_config(config, 'sqlalchemy.db1.')
90 91 init_model(engine)
91 92 index_location = config['index_dir']
92 93 repo_location = self.options.repo_location \
93 94 if self.options.repo_location else RepoModel().repos_path
94 95 repo_list = map(strip, self.options.repo_list.split(',')) \
95 96 if self.options.repo_list else None
96 97 repo_update_list = map(strip, self.options.repo_update_list.split(',')) \
97 98 if self.options.repo_update_list else None
98 99 load_rcextensions(config['here'])
99 100 #======================================================================
100 101 # WHOOSH DAEMON
101 102 #======================================================================
102 103 from rhodecode.lib.pidlock import LockHeld, DaemonLock
103 104 from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
104 105 try:
105 106 l = DaemonLock(file_=jn(dn(dn(index_location)), 'make_index.lock'))
106 107 WhooshIndexingDaemon(index_location=index_location,
107 108 repo_location=repo_location,
108 109 repo_list=repo_list,
109 110 repo_update_list=repo_update_list)\
110 111 .run(full_index=self.options.full_index)
111 112 l.release()
112 113 except LockHeld:
113 114 sys.exit(1)
114 115
115 116 def update_parser(self):
116 117 self.parser.add_option('--repo-location',
117 118 action='store',
118 119 dest='repo_location',
119 120 help="Specifies repositories location to index. OPTIONAL",
120 121 )
121 122 self.parser.add_option('--index-only',
122 123 action='store',
123 124 dest='repo_list',
124 125 help="Specifies a comma-separated list of repositories "
125 126 "to build index on. If not given, all repositories "
126 127 "are scanned for indexing. OPTIONAL",
127 128 )
128 129 self.parser.add_option('--update-only',
129 130 action='store',
130 131 dest='repo_update_list',
131 132 help="Specifies a comma-separated list of repositories "
132 133 "to rebuild index on. OPTIONAL",
133 134 )
134 135 self.parser.add_option('-f',
135 136 action='store_true',
136 137 dest='full_index',
137 138 help="Specifies that the index should be made full, i.e."
138 139 " destroy the old index and build from scratch",
139 140 default=False)
140 141
141 142
142 143 class WhooshResultWrapper(object):
143 144 def __init__(self, search_type, searcher, matcher, highlight_items,
144 145 repo_location):
145 146 self.search_type = search_type
146 147 self.searcher = searcher
147 148 self.matcher = matcher
148 149 self.highlight_items = highlight_items
149 150 self.fragment_size = 200
150 151 self.repo_location = repo_location
151 152
152 153 @LazyProperty
153 154 def doc_ids(self):
154 155 docs_id = []
155 156 while self.matcher.is_active():
156 157 docnum = self.matcher.id()
157 158 chunks = [offsets for offsets in self.get_chunks()]
158 159 docs_id.append([docnum, chunks])
159 160 self.matcher.next()
160 161 return docs_id
161 162
162 163 def __str__(self):
163 164 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
164 165
165 166 def __repr__(self):
166 167 return self.__str__()
167 168
168 169 def __len__(self):
169 170 return len(self.doc_ids)
170 171
171 172 def __iter__(self):
172 173 """
173 174 Allows iteration over results, and lazily generates content.
174 175
175 176 *Requires* implementation of ``__getitem__`` method.
176 177 """
177 178 for docid in self.doc_ids:
178 179 yield self.get_full_content(docid)
179 180
180 181 def __getitem__(self, key):
181 182 """
182 183 Slicing of resultWrapper
183 184 """
184 185 i, j = key.start, key.stop
185 186
186 187 slices = []
187 188 for docid in self.doc_ids[i:j]:
188 189 slices.append(self.get_full_content(docid))
189 190 return slices
190 191
191 192 def get_full_content(self, docid):
192 193 res = self.searcher.stored_fields(docid[0])
193 194 full_repo_path = jn(self.repo_location, res['repository'])
194 195 f_path = res['path'].split(full_repo_path)[-1]
195 196 f_path = f_path.lstrip(os.sep)
196 197
197 198 content_short = self.get_short_content(res, docid[1])
198 199 res.update({'content_short': content_short,
199 200 'content_short_hl': self.highlight(content_short),
200 201 'f_path': f_path})
201 202
202 203 return res
203 204
204 205 def get_short_content(self, res, chunks):
205 206
206 207 return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
207 208
208 209 def get_chunks(self):
209 210 """
210 211 Chunk the content without overlapping chunks, so the same
211 212 close occurrence is not highlighted twice.
212 213
213 214 Offsets are taken from ``self.matcher`` spans and padded on
214 215 each side by ``self.fragment_size``; a chunk overlapping the
215 216 previous one is clipped to start where it ended.
216 217 """
217 218 memory = [(0, 0)]
218 219 for span in self.matcher.spans():
219 220 start = span.startchar or 0
220 221 end = span.endchar or 0
221 222 start_offseted = max(0, start - self.fragment_size)
222 223 end_offseted = end + self.fragment_size
223 224
224 225 if start_offseted < memory[-1][1]:
225 226 start_offseted = memory[-1][1]
226 227 memory.append((start_offseted, end_offseted,))
227 228 yield (start_offseted, end_offseted,)
228 229
229 230 def highlight(self, content, top=5):
230 231 if self.search_type != 'content':
231 232 return ''
232 233 hl = highlight(
233 234 text=escape(content),
234 235 terms=self.highlight_items,
235 236 analyzer=ANALYZER,
236 237 fragmenter=FRAGMENTER,
237 238 formatter=FORMATTER,
238 239 top=top
239 240 )
240 241 return hl
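
With the unique ``fileid`` in place, the incremental update in the daemon (next hunk) can drop a stale document precisely before re-adding it. A hedged sketch of that delete-then-re-add pattern, assuming ``ix`` is an open index built with the schema above:

    writer = ix.writer()
    # delete_by_term matches the ID field exactly, so only the stale copy goes
    writer.delete_by_term('fileid', u'repo/setup.py')
    writer.add_document(fileid=u'repo/setup.py', content=u'version three')
    writer.commit(merge=True)
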
rhodecode/lib/indexers/daemon.py
@@ -1,251 +1,257 @@
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 rhodecode.lib.indexers.daemon
4 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 5
6 6 A daemon that reads from the task table and runs tasks
7 7
8 8 :created_on: Jan 26, 2010
9 9 :author: marcink
10 10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 11 :license: GPLv3, see COPYING for more details.
12 12 """
13 13 # This program is free software: you can redistribute it and/or modify
14 14 # it under the terms of the GNU General Public License as published by
15 15 # the Free Software Foundation, either version 3 of the License, or
16 16 # (at your option) any later version.
17 17 #
18 18 # This program is distributed in the hope that it will be useful,
19 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 21 # GNU General Public License for more details.
22 22 #
23 23 # You should have received a copy of the GNU General Public License
24 24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25
26 26 import os
27 27 import sys
28 28 import logging
29 29 import traceback
30 30
31 31 from shutil import rmtree
32 32 from time import mktime
33 33
34 34 from os.path import dirname as dn
35 35 from os.path import join as jn
36 36
37 37 #to get the rhodecode import
38 38 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
39 39 sys.path.append(project_path)
40 40
41 41 from rhodecode.config.conf import INDEX_EXTENSIONS
42 42 from rhodecode.model.scm import ScmModel
43 43 from rhodecode.lib.utils2 import safe_unicode
44 44 from rhodecode.lib.indexers import SCHEMA, IDX_NAME
45 45
46 46 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
47 47 NodeDoesNotExistError
48 48
49 49 from whoosh.index import create_in, open_dir
50 50
51 51 log = logging.getLogger('whoosh_indexer')
52 52
53 53
54 54 class WhooshIndexingDaemon(object):
55 55 """
56 56 Daemon for atomic indexing jobs
57 57 """
58 58
59 59 def __init__(self, indexname=IDX_NAME, index_location=None,
60 60 repo_location=None, sa=None, repo_list=None,
61 61 repo_update_list=None):
62 62 self.indexname = indexname
63 63
64 64 self.index_location = index_location
65 65 if not index_location:
66 66 raise Exception('You have to provide index location')
67 67
68 68 self.repo_location = repo_location
69 69 if not repo_location:
70 70 raise Exception('You have to provide repositories location')
71 71
72 72 self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
73 73
74 74 #filter repo list
75 75 if repo_list:
76 76 self.filtered_repo_paths = {}
77 77 for repo_name, repo in self.repo_paths.items():
78 78 if repo_name in repo_list:
79 79 self.filtered_repo_paths[repo_name] = repo
80 80
81 81 self.repo_paths = self.filtered_repo_paths
82 82
83 83 #filter update repo list
84 84 self.filtered_repo_update_paths = {}
85 85 if repo_update_list:
86 86 self.filtered_repo_update_paths = {}
87 87 for repo_name, repo in self.repo_paths.items():
88 88 if repo_name in repo_update_list:
89 89 self.filtered_repo_update_paths[repo_name] = repo
90 90 self.repo_paths = self.filtered_repo_update_paths
91 91
92 92 self.initial = False
93 93 if not os.path.isdir(self.index_location):
94 94 os.makedirs(self.index_location)
95 95 log.info('Cannot run incremental index since it does not'
96 96 ' yet exist; running full build')
97 97 self.initial = True
98 98
99 99 def get_paths(self, repo):
100 100 """
101 101 Recursively walk the root dir and return a set of all paths in it,
102 102 based on the repository walk function
103 103 """
104 104 index_paths_ = set()
105 105 try:
106 106 tip = repo.get_changeset('tip')
107 107 for topnode, dirs, files in tip.walk('/'):
108 108 for f in files:
109 109 index_paths_.add(jn(repo.path, f.path))
110 110
111 111 except RepositoryError, e:
112 112 log.debug(traceback.format_exc())
113 113 pass
114 114 return index_paths_
115 115
116 116 def get_node(self, repo, path):
117 117 n_path = path[len(repo.path) + 1:]
118 118 node = repo.get_changeset().get_node(n_path)
119 119 return node
120 120
121 121 def get_node_mtime(self, node):
122 122 return mktime(node.last_changeset.date.timetuple())
123 123
124 124 def add_doc(self, writer, path, repo, repo_name):
125 125 """
126 126 Add a doc to the writer; this function itself fetches data from
127 127 the vcs backend instance
128 128 """
129 129
130 130 node = self.get_node(repo, path)
131 131 indexed = indexed_w_content = 0
132 132 # we just index the content of chosen files, and skip binary files
133 133 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
134 134 u_content = node.content
135 135 if not isinstance(u_content, unicode):
136 136 log.warning(' >> %s could not get this content as unicode, '
137 137 'replacing with empty content' % path)
138 138 u_content = u''
139 139 else:
140 140 log.debug(' >> %s [WITH CONTENT]' % path)
141 141 indexed_w_content += 1
142 142
143 143 else:
144 144 log.debug(' >> %s' % path)
145 145 # just index the file name without its content
146 146 u_content = u''
147 147 indexed += 1
148 148
149 p = safe_unicode(path)
149 150 writer.add_document(
151 fileid=p,
150 152 owner=unicode(repo.contact),
151 153 repository=safe_unicode(repo_name),
152 path=safe_unicode(path),
154 path=p,
153 155 content=u_content,
154 156 modtime=self.get_node_mtime(node),
155 157 extension=node.extension
156 158 )
157 159 return indexed, indexed_w_content
158 160
159 161 def build_index(self):
160 162 if os.path.exists(self.index_location):
161 163 log.debug('removing previous index')
162 164 rmtree(self.index_location)
163 165
164 166 if not os.path.exists(self.index_location):
165 167 os.mkdir(self.index_location)
166 168
167 169 idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
168 170 writer = idx.writer()
169 171 log.debug('BUILDING INDEX FOR EXTENSIONS %s' % INDEX_EXTENSIONS)
170 172 for repo_name, repo in self.repo_paths.items():
171 173 log.debug('building index @ %s' % repo.path)
172 174 i_cnt = iwc_cnt = 0
173 175 for idx_path in self.get_paths(repo):
174 176 i, iwc = self.add_doc(writer, idx_path, repo, repo_name)
175 177 i_cnt += i
176 178 iwc_cnt += iwc
177 179 log.debug('added %s files (%s with content) for repo %s' % (
178 180 i_cnt + iwc_cnt, iwc_cnt, repo.path)
179 181 )
180 182
181 183 log.debug('>> COMMITTING CHANGES <<')
182 184 writer.commit(merge=True)
183 185 log.debug('>>> FINISHED BUILDING INDEX <<<')
184 186
185 187 def update_index(self):
186 188 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
187 189 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
188 190
189 191 idx = open_dir(self.index_location, indexname=self.indexname)
190 192 # The set of all paths in the index
191 193 indexed_paths = set()
192 194 # The set of all paths we need to re-index
193 195 to_index = set()
194 196
195 197 reader = idx.reader()
196 198 writer = idx.writer()
197 199
198 200 # Loop over the stored fields in the index
199 201 for fields in reader.all_stored_fields():
200 202 indexed_path = fields['path']
201 203 indexed_repo_path = fields['repository']
202 204 indexed_paths.add(indexed_path)
203 205
204 206 if indexed_repo_path not in self.filtered_repo_update_paths:
205 207 continue
206 208
207 209 repo = self.repo_paths[indexed_repo_path]
208 210
209 211 try:
210 212 node = self.get_node(repo, indexed_path)
211 213 # Check if this file was changed since it was indexed
212 214 indexed_time = fields['modtime']
213 215 mtime = self.get_node_mtime(node)
214 216 if mtime > indexed_time:
215 217 # The file has changed, delete it and add it to the list of
216 218 # files to reindex
217 log.debug('adding to reindex list %s' % indexed_path)
218 writer.delete_by_term('path', indexed_path)
219 log.debug('adding to reindex list %s mtime: %s vs %s' % (
220 indexed_path, mtime, indexed_time)
221 )
222 writer.delete_by_term('fileid', indexed_path)
223
219 224 to_index.add(indexed_path)
220 225 except (ChangesetError, NodeDoesNotExistError):
221 226 # This file was deleted since it was indexed
222 227 log.debug('removing from index %s' % indexed_path)
223 228 writer.delete_by_term('path', indexed_path)
224 229
225 230 # Loop over the files in the filesystem
226 231 # Assume we have a function that gathers the filenames of the
227 232 # documents to be indexed
228 233 ri_cnt = riwc_cnt = 0
229 234 for repo_name, repo in self.repo_paths.items():
230 235 for path in self.get_paths(repo):
231 236 path = safe_unicode(path)
232 237 if path in to_index or path not in indexed_paths:
238
233 239 # This is either a file that's changed, or a new file
234 240 # that wasn't indexed before. So index it!
235 241 i, iwc = self.add_doc(writer, path, repo, repo_name)
236 242 log.debug('re-indexing %s' % path)
237 243 ri_cnt += i
238 244 riwc_cnt += iwc
239 245 log.debug('added %s files (%s with content) for repo %s' % (
240 246 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
241 247 )
242 248 log.debug('>> COMMITTING CHANGES <<')
243 249 writer.commit(merge=True)
244 250 log.debug('>>> FINISHED REBUILDING INDEX <<<')
245 251
246 252 def run(self, full_index=False):
247 253 """Run daemon"""
248 254 if full_index or self.initial:
249 255 self.build_index()
250 256 else:
251 257 self.update_index()
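
For reference, the daemon is normally driven by the ``MakeIndex`` paster command from the first hunk; a direct invocation would look roughly like this (both locations are placeholders, not defaults):

    from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

    daemon = WhooshIndexingDaemon(index_location='/srv/rhodecode/data/index',
                                  repo_location='/srv/rhodecode/repos')
    # incremental update; falls back to a full build automatically
    # when no index exists yet
    daemon.run(full_index=False)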