##// END OF EJS Templates
fixes issue #524...
marcink -
r2718:82fb2a16 beta
parent child Browse files
Show More
@@ -1,265 +1,271 b''
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 rhodecode.lib.indexers.__init__
4 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 5
6 6 Whoosh indexing module for RhodeCode
7 7
8 8 :created_on: Aug 17, 2010
9 9 :author: marcink
10 10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 11 :license: GPLv3, see COPYING for more details.
12 12 """
13 13 # This program is free software: you can redistribute it and/or modify
14 14 # it under the terms of the GNU General Public License as published by
15 15 # the Free Software Foundation, either version 3 of the License, or
16 16 # (at your option) any later version.
17 17 #
18 18 # This program is distributed in the hope that it will be useful,
19 19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 21 # GNU General Public License for more details.
22 22 #
23 23 # You should have received a copy of the GNU General Public License
24 24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25 import os
26 26 import sys
27 27 import traceback
28 28 import logging
29 29 from os.path import dirname as dn, join as jn
30 30
31 31 #to get the rhodecode import
32 32 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
33 33
34 34 from string import strip
35 35 from shutil import rmtree
36 36
37 37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
38 38 from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
39 39 from whoosh.index import create_in, open_dir
40 40 from whoosh.formats import Characters
41 41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
42 42
43 43 from webhelpers.html.builder import escape, literal
44 44 from sqlalchemy import engine_from_config
45 45
46 46 from rhodecode.model import init_model
47 47 from rhodecode.model.scm import ScmModel
48 48 from rhodecode.model.repo import RepoModel
49 49 from rhodecode.config.environment import load_environment
50 50 from rhodecode.lib.utils2 import LazyProperty
51 51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
52 52 load_rcextensions
53 53
log = logging.getLogger(__name__)

# Custom analyzer: tokenize on runs of word characters, then lowercase
# every token.
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# Index schema for file documents; ``content`` is analyzed with ANALYZER
# and stored together with ``repository``, ``path`` and ``extension``.
SCHEMA = Schema(
    fileid=ID(unique=True),
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=TEXT(stored=True),
    content=FieldType(
        format=Characters(),
        analyzer=ANALYZER,
        scorable=True,
        stored=True,
    ),
    modtime=STORED(),
    extension=TEXT(stored=True),
)

IDX_NAME = 'HG_INDEX'

# Highlighting helpers: matches are wrapped in <span> tags, fragments are
# joined by a "break" marker, and the fragmenter is built with a value of
# 200 (presumably characters of surrounding context -- confirm in Whoosh).
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

# Index schema for changeset documents; ``message`` is analyzed with
# ANALYZER, the remaining text fields use the Whoosh defaults.
CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(
        format=Characters(),
        analyzer=ANALYZER,
        scorable=True,
        stored=True,
    ),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)

CHGSET_IDX_NAME = 'CHGSET_INDEX'
91 91
92
class MakeIndex(BasePasterCommand):
    """Paster command that builds the Whoosh full text search index.

    Takes exactly one positional argument (the configuration file) plus
    the options registered in ``update_parser``.
    """

    max_args = 1
    min_args = 1

    usage = "CONFIG_FILE"
    summary = "Creates index for full text search given configuration file"
    group_name = "RhodeCode"
    takes_config_file = -1
    parser = Command.standard_parser(verbose=True)

    @staticmethod
    def _split_names(value):
        """Split a comma separated option value into stripped names.

        :param value: raw option string, may be empty or ``None``
        :returns: list of names with surrounding whitespace removed, or
            ``None`` when no value was given
        """
        if not value:
            return None
        return [name.strip() for name in value.split(',')]

    def command(self):
        """Configure the environment and run the indexing daemon.

        Exits the process with status ``1`` when ``LockHeld`` is raised,
        i.e. when the ``make_index.lock`` lock cannot be acquired.
        """
        # ``import logging`` alone does not load the ``logging.config``
        # submodule, so import it explicitly instead of relying on some
        # other module having done so.
        import logging.config
        logging.config.fileConfig(self.path_to_ini_file)
        from pylons import config
        add_cache(config)
        engine = engine_from_config(config, 'sqlalchemy.db1.')
        init_model(engine)
        index_location = config['index_dir']
        # the command line option wins over the configured repos path
        repo_location = (self.options.repo_location
                         or RepoModel().repos_path)
        repo_list = self._split_names(self.options.repo_list)
        repo_update_list = self._split_names(self.options.repo_update_list)
        load_rcextensions(config['here'])
        #======================================================================
        # WHOOSH DAEMON
        #======================================================================
        from rhodecode.lib.pidlock import LockHeld, DaemonLock
        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
        try:
            lock = DaemonLock(
                file_=jn(dn(dn(index_location)), 'make_index.lock')
            )
            try:
                WhooshIndexingDaemon(
                    index_location=index_location,
                    repo_location=repo_location,
                    repo_list=repo_list,
                    repo_update_list=repo_update_list,
                ).run(full_index=self.options.full_index)
            finally:
                # release the lock even when indexing fails part way
                lock.release()
        except LockHeld:
            sys.exit(1)

    def update_parser(self):
        """Register the command line options understood by this command."""
        self.parser.add_option(
            '--repo-location',
            action='store',
            dest='repo_location',
            help="Specifies repositories location to index OPTIONAL",
        )
        self.parser.add_option(
            '--index-only',
            action='store',
            dest='repo_list',
            help="Specifies a comma separated list of repositores "
                 "to build index on. If not given all repositories "
                 "are scanned for indexing. OPTIONAL",
        )
        self.parser.add_option(
            '--update-only',
            action='store',
            dest='repo_update_list',
            help="Specifies a comma separated list of repositores "
                 "to re-build index on. OPTIONAL",
        )
        self.parser.add_option(
            '-f',
            action='store_true',
            dest='full_index',
            help="Specifies that index should be made full i.e"
                 " destroy old and build from scratch",
            default=False,
        )
158 159
159 160
class WhooshResultWrapper(object):
    """Sliceable, iterable view over the documents matched by a search.

    The matcher is walked once to collect document numbers and highlight
    offsets; stored fields are only fetched when an item is accessed.

    :param search_type: kind of search; ``'content'``, ``'path'`` and
        ``'message'`` get extra fields added to each result
    :param searcher: object providing ``stored_fields(docnum)``
    :param matcher: matcher positioned on the first matching document
    :param highlight_items: terms passed to the highlighter
    :param repo_location: base directory the repositories live in
    """

    def __init__(self, search_type, searcher, matcher, highlight_items,
                 repo_location):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # number of characters kept on each side of a matched span
        self.fragment_size = 200
        self.repo_location = repo_location

    @LazyProperty
    def doc_ids(self):
        """List of ``[docnum, chunks]`` pairs for every matched document.

        Advances the matcher until it is exhausted.
        NOTE(review): ``LazyProperty`` presumably caches the result so the
        matcher is consumed only once -- confirm in rhodecode.lib.utils2.
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            # chunks must be collected before moving to the next document
            chunks = list(self.get_chunks())
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """Iterate over the results, building each one on demand."""
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getitem__(self, key):
        """Return one result for an integer key, or a list for a slice.

        Slices honour ``start``, ``stop`` and ``step``; integer keys follow
        normal list indexing and raise ``IndexError`` when out of range.
        """
        if isinstance(key, slice):
            return [self.get_full_content(docid)
                    for docid in self.doc_ids[key]]
        return self.get_full_content(self.doc_ids[key])

    def _relative_path(self, res):
        """Return ``res['path']`` with the repository directory removed."""
        full_repo_path = jn(self.repo_location, res['repository'])
        return res['path'].split(full_repo_path)[-1].lstrip(os.sep)

    def get_full_content(self, docid):
        """Fetch the stored fields of a match and add display fields.

        :param docid: a ``[docnum, chunks]`` pair taken from ``doc_ids``
        :returns: the stored fields mapping, extended in place with
            ``f_path`` (content and path searches), ``content_short`` and
            ``content_short_hl`` (content searches) or ``message_hl``
            (message searches)
        """
        res = self.searcher.stored_fields(docid[0])
        log.debug('result: %s', res)
        if self.search_type == 'content':
            content_short = self.get_short_content(res, docid[1])
            res.update({'content_short': content_short,
                        'content_short_hl': self.highlight(content_short),
                        'f_path': self._relative_path(res)})
        elif self.search_type == 'path':
            res.update({'f_path': self._relative_path(res)})
        elif self.search_type == 'message':
            res.update({'message_hl': self.highlight(res['message'])})

        log.debug('result: %s', res)

        return res

    def get_short_content(self, res, chunks):
        """Join the slices of ``res['content']`` described by *chunks*.

        :param res: stored fields mapping containing ``content``
        :param chunks: iterable of ``(start, end)`` character offsets
        """
        return ''.join([res['content'][chunk[0]:chunk[1]]
                        for chunk in chunks])

    def get_chunks(self):
        """Yield ``(start, end)`` offsets around each span of the current match.

        Every span is widened by ``fragment_size`` on both sides. A chunk
        never starts before the end of the previously yielded chunk, so
        close occurrences are not highlighted twice. Nothing is yielded
        when the matcher does not support positions.
        """
        last_end = 0
        if self.matcher.supports('positions'):
            for span in self.matcher.spans():
                start = span.startchar or 0
                end = span.endchar or 0
                start_offseted = max(0, start - self.fragment_size)
                end_offseted = end + self.fragment_size

                # clamp so this chunk does not overlap the previous one
                if start_offseted < last_end:
                    start_offseted = last_end
                last_end = end_offseted
                yield (start_offseted, end_offseted,)

    def highlight(self, content, top=5):
        """Return *content* with the search terms highlighted.

        :param content: text to highlight
        :param top: value passed through as the highlighter's ``top``
        :returns: highlighted text, or ``''`` for search types other than
            ``'content'`` and ``'message'``
        """
        if self.search_type not in ['content', 'message']:
            return ''
        hl = highlight(
            text=content,
            terms=self.highlight_items,
            analyzer=ANALYZER,
            fragmenter=FRAGMENTER,
            formatter=FORMATTER,
            top=top
        )
        return hl
@@ -1,92 +1,99 b''
1 1 import os
2 2 from rhodecode.tests import *
3 3 from nose.plugins.skip import SkipTest
4 4
5 5
class TestSearchController(TestController):
    """Functional tests for the ``index`` action of the search controller."""

    def _search(self, params=None, **route_args):
        """Log in, then GET the search page with the given query params."""
        self.log_user()
        target = url(controller='search', action='index', **route_args)
        return self.app.get(target, params)

    def test_index(self):
        resp = self._search()

        expected = 'class="small" id="q" name="q" type="text"'
        self.assertTrue(expected in resp.body)
        # Test response...

    def test_empty_search(self):
        # only meaningful when no index has been built yet
        if os.path.isdir(self.index_location):
            raise SkipTest('skipped due to existing index')

        resp = self._search({'q': HG_REPO})
        self.assertTrue('There is no index to search in. '
                        'Please run whoosh indexer' in resp.body)

    def test_normal_search(self):
        resp = self._search({'q': 'def repo'})
        resp.mustcontain('39 results')

    def test_repo_search(self):
        resp = self._search({'q': 'repository:%s def test' % HG_REPO})

        resp.mustcontain('4 results')

    def test_search_last(self):
        resp = self._search({'q': 'last:t', 'type': 'commit'})

        resp.mustcontain('2 results')

    def test_search_commit_message(self):
        query = {'q': 'bother to ask where to fetch repo during tests',
                 'type': 'commit'}
        resp = self._search(query)

        resp.mustcontain('2 results')
        resp.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')
        resp.mustcontain('c6eb379775c578a95dad8ddab53f963b80894850')

    def test_search_commit_message_hg_repo(self):
        query = {'q': 'bother to ask where to fetch repo during tests',
                 'type': 'commit'}
        resp = self._search(query, search_repo=HG_REPO)

        resp.mustcontain('1 results')
        resp.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')

    def test_search_commit_changed_file(self):
        query = {'q': 'changed:tests/utils.py',
                 'type': 'commit'}
        resp = self._search(query)

        resp.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1')

    def test_search_commit_added_file(self):
        query = {'q': 'added:README.rst',
                 'type': 'commit'}
        resp = self._search(query)

        resp.mustcontain('2 results')
        #HG
        resp.mustcontain('3803844fdbd3b711175fc3da9bdacfcd6d29a6fb')
        #GIT
        resp.mustcontain('ff7ca51e58c505fec0dd2491de52c622bb7a806b')

    def test_search_author(self):
        query = {'q': 'author:marcin@python-blog.com raw_id:b986218ba1c9b0d6a259fac9b050b1724ed8e545',
                 'type': 'commit'}
        resp = self._search(query)

        resp.mustcontain('1 results')

    def test_search_file_name(self):
        resp = self._search({'q': 'README.rst', 'type': 'path'})

        resp.mustcontain('2 results')
General Comments 0
You need to be logged in to leave comments. Login now