##// END OF EJS Templates
search: added sorting of search results for the Whoosh backend.
marcink -
r3964:ec30b572 default
parent child Browse files
Show More
@@ -1,286 +1,293 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2012-2019 RhodeCode GmbH
3 # Copyright (C) 2012-2019 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 """
21 """
22 Index schema for RhodeCode
22 Index schema for RhodeCode
23 """
23 """
24
24
25 from __future__ import absolute_import
25 from __future__ import absolute_import
26 import os
26 import os
27 import re
27 import re
28 import logging
28 import logging
29
29
30 from whoosh import query as query_lib
30 from whoosh import query as query_lib
31 from whoosh.highlight import HtmlFormatter, ContextFragmenter
31 from whoosh.highlight import HtmlFormatter, ContextFragmenter
32 from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
32 from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
33 from whoosh.qparser import QueryParser, QueryParserError
33 from whoosh.qparser import QueryParser, QueryParserError
34
34
35 import rhodecode.lib.helpers as h
35 import rhodecode.lib.helpers as h
36 from rhodecode.lib.index import BaseSearcher
36 from rhodecode.lib.index import BaseSearcher
37 from rhodecode.lib.utils2 import safe_unicode
37 from rhodecode.lib.utils2 import safe_unicode
38
38
39 log = logging.getLogger(__name__)
39 log = logging.getLogger(__name__)
40
40
41
41
42 try:
42 try:
43 # we first try to import from rhodecode tools, fallback to copies if
43 # we first try to import from rhodecode tools, fallback to copies if
44 # we're unable to
44 # we're unable to
45 from rhodecode_tools.lib.fts_index.whoosh_schema import (
45 from rhodecode_tools.lib.fts_index.whoosh_schema import (
46 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
46 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
47 COMMIT_SCHEMA)
47 COMMIT_SCHEMA)
48 except ImportError:
48 except ImportError:
49 log.warning('rhodecode_tools schema not available, doing a fallback '
49 log.warning('rhodecode_tools schema not available, doing a fallback '
50 'import from `rhodecode.lib.index.whoosh_fallback_schema`')
50 'import from `rhodecode.lib.index.whoosh_fallback_schema`')
51 from rhodecode.lib.index.whoosh_fallback_schema import (
51 from rhodecode.lib.index.whoosh_fallback_schema import (
52 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
52 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
53 COMMIT_SCHEMA)
53 COMMIT_SCHEMA)
54
54
55
55
# Shared highlight helpers: wrap matches in <span> tags and fragment long
# content around the matched context (200 chars).
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

# NOTE: `log` is already created near the top of this module; the second,
# identical `log = logging.getLogger(__name__)` that used to live here was
# redundant and has been dropped.
60
60
61
61
class WhooshSearcher(BaseSearcher):
    """
    Full-text searcher backed by local whoosh indexes: one index for file
    content/paths and one for commits.
    """
    # this also shows in UI
    query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
    name = 'whoosh'

    def __init__(self, config):
        """
        :param config: dict-like; must contain 'location', the directory
            holding (or that will hold) the whoosh index files.
        """
        # Reference the class directly instead of the module-level `Searcher`
        # alias: that alias is only assigned after this class body executes,
        # so using it here was a fragile forward dependency.
        super(WhooshSearcher, self).__init__()
        self.config = config
        if not os.path.isdir(self.config['location']):
            os.makedirs(self.config['location'])

        # open the existing index if one is present, otherwise create it
        opener = create_in
        if exists_in(self.config['location'], indexname=FILE_INDEX_NAME):
            opener = open_dir
        file_index = opener(self.config['location'], schema=FILE_SCHEMA,
                            indexname=FILE_INDEX_NAME)

        opener = create_in
        if exists_in(self.config['location'], indexname=COMMIT_INDEX_NAME):
            opener = open_dir
        changeset_index = opener(self.config['location'], schema=COMMIT_SCHEMA,
                                 indexname=COMMIT_INDEX_NAME)

        self.commit_schema = COMMIT_SCHEMA
        self.commit_index = changeset_index
        self.file_schema = FILE_SCHEMA
        self.file_index = file_index
        self.searcher = None  # lazily opened by _init_searcher()

    def cleanup(self):
        """Close the currently open searcher, if any."""
        if self.searcher:
            self.searcher.close()

    def _extend_query(self, query):
        """
        If `query` contains anything that looks like a (partial) commit hash,
        OR the original query with ``commit_id`` prefix matches so users can
        paste hashes straight into the search box.
        """
        hashes = re.compile('([0-9a-f]{5,40})').findall(query)
        if hashes:
            hashes_or_query = ' OR '.join('commit_id:%s*' % h for h in hashes)
            query = u'(%s) OR %s' % (query, hashes_or_query)
        return query

    def search(self, query, document_type, search_user,
               repo_name=None, repo_group_name=None,
               requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
        """
        Run `query` against the index selected by `document_type`, filtered to
        repositories `search_user` may read.

        :returns: dict with keys ``results`` (WhooshResultWrapper), ``count``,
            ``error`` and ``runtime``.

        NOTE: `repo_group_name`, `requested_page`, `page_limit` and
        `raise_on_exc` are accepted for interface parity with the other
        search backends and are not used by this whoosh implementation.
        """
        query = self._extend_query(query)

        log.debug(u'QUERY: %s on %s', query, document_type)
        result = {
            'results': [],
            'count': 0,
            'error': None,
            'runtime': 0
        }
        search_type, index_name, schema_defn = self._prepare_for_search(
            document_type)
        self._init_searcher(index_name)
        try:
            qp = QueryParser(search_type, schema=schema_defn)
            allowed_repos_filter = self._get_repo_filter(
                search_user, repo_name)
            try:
                query = qp.parse(safe_unicode(query))
                log.debug('query: %s (%s)', query, repr(query))

                def sort_def(_direction, _sort_field):
                    # map elasticsearch-style sort field names onto the
                    # fields the whoosh schema actually defines
                    field2whoosh = {
                        'message.raw': 'message',
                        'author.email.raw': 'author',
                    }
                    return field2whoosh.get(_sort_field) or _sort_field

                reverse, sorted_by = False, None
                direction, sort_field = self.get_sort(search_type, sort)
                if sort_field:
                    # use self.* instead of the post-class `Searcher` alias
                    if direction == self.DIRECTION_DESC:
                        reverse = True
                    if direction == self.DIRECTION_ASC:
                        reverse = False
                    sorted_by = sort_def(direction, sort_field)

                whoosh_results = self.searcher.search(
                    query, filter=allowed_repos_filter, limit=None,
                    sortedby=sorted_by, reverse=reverse)

                # fixes for 32k limit that whoosh uses for highlight
                whoosh_results.fragmenter.charlimit = None
                res_ln = whoosh_results.scored_length()
                result['runtime'] = whoosh_results.runtime
                result['count'] = res_ln
                result['results'] = WhooshResultWrapper(
                    search_type, res_ln, whoosh_results)

            except QueryParserError:
                result['error'] = 'Invalid search query. Try quoting it.'
        except (EmptyIndexError, IOError, OSError):
            msg = 'There is no index to search in. Please run whoosh indexer'
            log.exception(msg)
            result['error'] = msg
        except Exception:
            msg = 'An error occurred during this search operation'
            log.exception(msg)
            result['error'] = msg

        return result

    def statistics(self, translator):
        """Return UI-ready statistics rows describing both indexes."""
        _ = translator
        stats = [
            {'key': _('Index Type'), 'value': 'Whoosh'},
            {'sep': True},

            {'key': _('File Index'), 'value': str(self.file_index)},
            {'key': _('Indexed documents'), 'value': self.file_index.doc_count()},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.file_index.last_modified())},

            {'sep': True},

            {'key': _('Commit index'), 'value': str(self.commit_index)},
            {'key': _('Indexed documents'), 'value': str(self.commit_index.doc_count())},
            {'key': _('Last update'), 'value': h.time_to_datetime(self.commit_index.last_modified())}
        ]
        return stats

    def _get_repo_filter(self, auth_user, repo_name):
        """
        Build a whoosh filter restricting hits to repositories `auth_user`
        can read. Returns None (no filtering) for 'hg.admin' users.
        """
        allowed_to_search = [
            repo for repo, perm in
            auth_user.permissions['repositories'].items()
            if perm != 'repository.none']

        if repo_name:
            repo_filter = [query_lib.Term('repository', repo_name)]

        elif 'hg.admin' in auth_user.permissions.get('global', []):
            return None

        else:
            repo_filter = [query_lib.Term('repository', _rn)
                           for _rn in allowed_to_search]
            # in case we're not allowed to search anywhere, it's a trick
            # to tell whoosh we're filtering, on ALL results
            repo_filter = repo_filter or [query_lib.Term('repository', '')]

        return query_lib.Or(repo_filter)

    def _prepare_for_search(self, cur_type):
        """
        Translate a UI document type ('content'/'commit'/'path'/...) into the
        (search_field, index_name, schema) triple whoosh needs.
        """
        search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(cur_type, 'content')

        index_name = {
            'content': FILE_INDEX_NAME,
            'commit': COMMIT_INDEX_NAME,
            'path': FILE_INDEX_NAME
        }.get(cur_type, FILE_INDEX_NAME)

        schema_defn = {
            'content': self.file_schema,
            'commit': self.commit_schema,
            'path': self.file_schema
        }.get(cur_type, self.file_schema)

        log.debug('IDX: %s', index_name)
        log.debug('SCHEMA: %s', schema_defn)
        return search_type, index_name, schema_defn

    def _init_searcher(self, index_name):
        """Open index `index_name` and store a searcher on it for this query."""
        idx = open_dir(self.config['location'], indexname=index_name)
        self.searcher = idx.searcher()
        return self.searcher
229
236
230
237
# Module-level alias; `Searcher` is the generic name other code in this
# module/package refers to — TODO confirm external importers before renaming.
Searcher = WhooshSearcher
232
239
233
240
class WhooshResultWrapper(object):
    """
    Thin lazy adapter over a whoosh result set.

    Hits are turned into plain dicts only when consumed (iteration or
    slicing), so highlight extraction happens just for the hits actually
    rendered.
    """

    def __init__(self, search_type, total_hits, results):
        self.search_type = search_type
        self.results = results
        self.total_hits = total_hits

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        # reported size is the scored hit count, not what has been consumed
        return self.total_hits

    def __iter__(self):
        """Lazily yield every hit as a fully-populated result dict."""
        return (self.get_full_content(hit) for hit in self.results)

    def __getitem__(self, key):
        """Lazy slicing: `key` is expected to be a slice object."""
        start, stop = key.start, key.stop
        return (self.get_full_content(hit)
                for hit in self.results[start:stop])

    def get_full_content(self, hit):
        # the hit object carries more fields than the UI needs; copy it into
        # a plain dict and enrich with presentation-only keys
        res = dict(hit)
        # elastic search uses that, we set it empty so it fallbacks to regular HL logic
        res['content_highlight'] = ''

        f_path = ''  # pragma: no cover
        if self.search_type in ('content', 'path'):
            # strip the leading repository name (and path separator) so the
            # UI shows a repo-relative file path
            f_path = res['path'][len(res['repository']):].lstrip(os.sep)

        extra = {}
        if self.search_type == 'content':
            extra = {'content_short_hl': hit.highlights('content'),
                     'f_path': f_path}
        elif self.search_type == 'path':
            extra = {'f_path': f_path}
        elif self.search_type == 'message':
            extra = {'message_hl': hit.highlights('message')}
        res.update(extra)

        return res
General Comments 0
You need to be logged in to leave comments. Login now