##// END OF EJS Templates
search: added sorting of search results for whoosh backend.
marcink -
r3964:ec30b572 default
parent child Browse files
Show More
@@ -1,286 +1,293 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2012-2019 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21 """
22 22 Index schema for RhodeCode
23 23 """
24 24
25 25 from __future__ import absolute_import
26 26 import os
27 27 import re
28 28 import logging
29 29
30 30 from whoosh import query as query_lib
31 31 from whoosh.highlight import HtmlFormatter, ContextFragmenter
32 32 from whoosh.index import create_in, open_dir, exists_in, EmptyIndexError
33 33 from whoosh.qparser import QueryParser, QueryParserError
34 34
35 35 import rhodecode.lib.helpers as h
36 36 from rhodecode.lib.index import BaseSearcher
37 37 from rhodecode.lib.utils2 import safe_unicode
38 38
39 39 log = logging.getLogger(__name__)
40 40
41 41
42 42 try:
43 43 # we first try to import from rhodecode tools, fallback to copies if
44 44 # we're unable to
45 45 from rhodecode_tools.lib.fts_index.whoosh_schema import (
46 46 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
47 47 COMMIT_SCHEMA)
48 48 except ImportError:
49 49 log.warning('rhodecode_tools schema not available, doing a fallback '
50 50 'import from `rhodecode.lib.index.whoosh_fallback_schema`')
51 51 from rhodecode.lib.index.whoosh_fallback_schema import (
52 52 ANALYZER, FILE_INDEX_NAME, FILE_SCHEMA, COMMIT_INDEX_NAME,
53 53 COMMIT_SCHEMA)
54 54
55 55
# Whoosh highlight formatter: emits ``span`` tags for highlighted terms and
# joins separate fragments with a visual "..." break marker.
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

# Whoosh highlight fragmenter, configured with a size of 200
# NOTE(review): presumably the max number of characters per fragment - confirm
# against the whoosh ``ContextFragmenter`` signature.
FRAGMENTER = ContextFragmenter(200)

# NOTE: the module level ``log`` is already defined right after the imports,
# it is intentionally not re-assigned here.
60 60
61 61
class WhooshSearcher(BaseSearcher):
    """
    Search backend using Whoosh indexes stored under ``config['location']``.

    Two indexes are handled: the file index (``FILE_INDEX_NAME``) and the
    commit index (``COMMIT_INDEX_NAME``). Both are created on instantiation
    if they do not exist yet.
    """
    # this also shows in UI
    query_lang_doc = 'http://whoosh.readthedocs.io/en/latest/querylang.html'
    name = 'whoosh'

    # hex strings of 5-40 chars, treated as (possibly shortened) commit hashes
    _COMMIT_HASH_RE = re.compile(r'([0-9a-f]{5,40})')

    # translation of requested sort field names into whoosh field names,
    # names missing here are passed to whoosh unchanged
    _SORT_FIELD_MAP = {
        'message.raw': 'message',
        'author.email.raw': 'author',
    }

    def __init__(self, config):
        """
        :param config: mapping with at least a ``location`` key pointing to
            the directory holding the indexes; the directory is created
            when missing.
        """
        # name the class explicitly instead of relying on the module level
        # `Searcher` alias, which is only bound after this class body
        super(WhooshSearcher, self).__init__()
        self.config = config
        location = self.config['location']
        if not os.path.isdir(location):
            os.makedirs(location)

        file_index = self._open_or_create_index(
            location, FILE_SCHEMA, FILE_INDEX_NAME)
        changeset_index = self._open_or_create_index(
            location, COMMIT_SCHEMA, COMMIT_INDEX_NAME)

        self.commit_schema = COMMIT_SCHEMA
        self.commit_index = changeset_index
        self.file_schema = FILE_SCHEMA
        self.file_index = file_index
        self.searcher = None

    @staticmethod
    def _open_or_create_index(location, schema, index_name):
        """
        Open index `index_name` stored in `location`, create it if missing.
        """
        opener = create_in
        if exists_in(location, indexname=index_name):
            opener = open_dir
        return opener(location, schema=schema, indexname=index_name)

    def cleanup(self):
        """
        Close the searcher opened by the last search, if there is one.
        """
        if self.searcher:
            self.searcher.close()
            # drop the reference so repeated cleanup calls are harmless
            self.searcher = None

    def _extend_query(self, query):
        """
        Extend `query` with ``commit_id:<hash>*`` alternatives for every
        hash-like string found in it, so typed hashes also match commits.

        :param query: raw query string
        :return: extended query, or `query` unchanged if no hash was found
        """
        hashes = self._COMMIT_HASH_RE.findall(query)
        if hashes:
            hashes_or_query = ' OR '.join(
                'commit_id:%s*' % commit_hash for commit_hash in hashes)
            query = u'(%s) OR %s' % (query, hashes_or_query)
        return query

    def search(self, query, document_type, search_user,
               repo_name=None, repo_group_name=None,
               requested_page=1, page_limit=10, sort=None, raise_on_exc=True):
        """
        Run `query` against the index matching `document_type`.

        :param query: query string in whoosh query language
        :param document_type: one of the keys understood by
            ``_prepare_for_search``, unknown values fall back to content
            search
        :param search_user: user whose permissions restrict the result set
        :param repo_name: when given, restrict results to that repository
        :param sort: sort specification, resolved via ``self.get_sort``
        :return: dict with ``results``, ``count``, ``error`` and ``runtime``
            keys; on failure ``error`` holds a message, nothing is raised

        `repo_group_name`, `requested_page`, `page_limit` and `raise_on_exc`
        are accepted but not used by this backend.
        """
        query = self._extend_query(query)

        log.debug(u'QUERY: %s on %s', query, document_type)
        result = {
            'results': [],
            'count': 0,
            'error': None,
            'runtime': 0
        }
        search_type, index_name, schema_defn = self._prepare_for_search(
            document_type)
        try:
            # opening the index happens inside the `try`, so a missing or
            # unreadable index is reported via result['error'] as well
            self._init_searcher(index_name)
            qp = QueryParser(search_type, schema=schema_defn)
            allowed_repos_filter = self._get_repo_filter(
                search_user, repo_name)
            try:
                query = qp.parse(safe_unicode(query))
                log.debug('query: %s (%s)', query, repr(query))

                reverse, sorted_by = False, None
                direction, sort_field = self.get_sort(search_type, sort)
                if sort_field:
                    # anything else than DESC keeps the ascending order
                    reverse = direction == self.DIRECTION_DESC
                    sorted_by = self._SORT_FIELD_MAP.get(
                        sort_field, sort_field)

                whoosh_results = self.searcher.search(
                    query, filter=allowed_repos_filter, limit=None,
                    sortedby=sorted_by, reverse=reverse)

                # fixes for 32k limit that whoosh uses for highlight
                whoosh_results.fragmenter.charlimit = None
                res_ln = whoosh_results.scored_length()
                result['runtime'] = whoosh_results.runtime
                result['count'] = res_ln
                result['results'] = WhooshResultWrapper(
                    search_type, res_ln, whoosh_results)

            except QueryParserError:
                result['error'] = 'Invalid search query. Try quoting it.'
        except (EmptyIndexError, IOError, OSError):
            msg = 'There is no index to search in. Please run whoosh indexer'
            log.exception(msg)
            result['error'] = msg
        except Exception:
            msg = 'An error occurred during this search operation'
            log.exception(msg)
            result['error'] = msg

        return result

    def statistics(self, translator):
        """
        Return a list of dicts describing both indexes, for display.

        :param translator: callable used to translate the labels
        """
        _ = translator
        stats = [
            {'key': _('Index Type'), 'value': 'Whoosh'},
            {'sep': True},

            {'key': _('File Index'), 'value': str(self.file_index)},
            {'key': _('Indexed documents'),
             'value': self.file_index.doc_count()},
            {'key': _('Last update'),
             'value': h.time_to_datetime(self.file_index.last_modified())},

            {'sep': True},

            {'key': _('Commit index'), 'value': str(self.commit_index)},
            {'key': _('Indexed documents'),
             'value': str(self.commit_index.doc_count())},
            {'key': _('Last update'),
             'value': h.time_to_datetime(self.commit_index.last_modified())}
        ]
        return stats

    def _get_repo_filter(self, auth_user, repo_name):
        """
        Build the whoosh filter limiting results to allowed repositories.

        :param auth_user: user object exposing a ``permissions`` mapping
        :param repo_name: optional repository to restrict the search to
        :return: ``None`` (no filtering) for users with global ``hg.admin``
            permission, otherwise an ``Or`` query of ``repository`` terms
        """
        if repo_name:
            # NOTE(review): no permission check is done for `repo_name` here,
            # presumably the caller verified access already - confirm
            return query_lib.Or([query_lib.Term('repository', repo_name)])

        if 'hg.admin' in auth_user.permissions.get('global', []):
            return None

        repo_filter = [
            query_lib.Term('repository', allowed_repo)
            for allowed_repo, perm in
            auth_user.permissions['repositories'].items()
            if perm != 'repository.none']
        # in case we're not allowed to search anywhere, it's a trick
        # to tell whoosh we're filtering, on ALL results
        repo_filter = repo_filter or [query_lib.Term('repository', '')]

        return query_lib.Or(repo_filter)

    def _prepare_for_search(self, cur_type):
        """
        Resolve a document type into what is needed to query it.

        :param cur_type: ``content``, ``commit``, ``path`` or ``repository``;
            anything else is handled like ``content``
        :return: tuple of (default search field, index name, schema)
        """
        search_type = {
            'content': 'content',
            'commit': 'message',
            'path': 'path',
            'repository': 'repository'
        }.get(cur_type, 'content')

        # only commit searches use the commit index, all the other types
        # (including unknown ones) are served from the file index
        if cur_type == 'commit':
            index_name, schema_defn = COMMIT_INDEX_NAME, self.commit_schema
        else:
            index_name, schema_defn = FILE_INDEX_NAME, self.file_schema

        log.debug('IDX: %s', index_name)
        log.debug('SCHEMA: %s', schema_defn)
        return search_type, index_name, schema_defn

    def _init_searcher(self, index_name):
        """
        Open index `index_name` and store a searcher for it on the instance.

        :return: the newly created searcher
        """
        idx = open_dir(self.config['location'], indexname=index_name)
        self.searcher = idx.searcher()
        return self.searcher


Searcher = WhooshSearcher
232 239
233 240
class WhooshResultWrapper(object):
    """
    Sized, iterable and sliceable wrapper around raw search results.

    Each hit handed out is converted into a plain dict extended with
    display related keys, see ``get_full_content``.
    """

    def __init__(self, search_type, total_hits, results):
        """
        :param search_type: searched field, ``content``, ``path`` and
            ``message`` get extra keys in ``get_full_content``
        :param total_hits: number reported by ``len()``
        :param results: iterable, sliceable collection of hits
        """
        self.search_type = search_type
        self.results = results
        self.total_hits = total_hits

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return self.total_hits

    def __iter__(self):
        """
        Allows Iteration over results,and lazy generate content

        *Requires* implementation of ``__getitem__`` method.
        """
        for hit in self.results:
            yield self.get_full_content(hit)

    def __getitem__(self, key):
        """
        Slicing and indexing of resultWrapper

        :param key: a slice returns a lazy generator of decorated hits
            (the slice step is ignored); any other key is passed to the
            underlying results and the single decorated hit is returned.
        """
        if isinstance(key, slice):
            return self._iter_slice(key.start, key.stop)
        return self.get_full_content(self.results[key])

    def _iter_slice(self, start, stop):
        """
        Lazily yield decorated hits for ``results[start:stop]``.
        """
        for hit in self.results[start:stop]:
            yield self.get_full_content(hit)

    def get_full_content(self, hit):
        """
        Convert `hit` into a dict extended with keys used for display.

        :param hit: mapping-like hit; for ``content`` and ``message``
            searches it must also provide ``highlights(fieldname)``
        :return: new dict, `hit` itself is not modified
        """
        # TODO: marcink: this feels like an overkill, there's a lot of data
        # inside hit object, and we don't need all
        res = dict(hit)
        # elastic search uses that, we set it empty so it fallbacks to
        # regular HL logic
        res['content_highlight'] = ''

        search_type = self.search_type
        if search_type in ['content', 'path']:
            # cut the leading repository name off the path
            # NOTE(review): assumes `path` starts with the repository name
            f_path = res['path'][len(res['repository']):]
            res['f_path'] = f_path.lstrip(os.sep)

        if search_type == 'content':
            res['content_short_hl'] = hit.highlights('content')
        elif search_type == 'message':
            res['message_hl'] = hit.highlights('message')

        return res
General Comments 0
You need to be logged in to leave comments. Login now