##// END OF EJS Templates
Allowing indexing job to resolve repos path on its own if not given.
Jared Bunting -
r1407:2744f5b0 beta
parent child Browse files
Show More
@@ -1,224 +1,225
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2 """
2 """
3 rhodecode.lib.indexers.__init__
3 rhodecode.lib.indexers.__init__
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5
5
6 Whoosh indexing module for RhodeCode
6 Whoosh indexing module for RhodeCode
7
7
8 :created_on: Aug 17, 2010
8 :created_on: Aug 17, 2010
9 :author: marcink
9 :author: marcink
10 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
10 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details.
11 :license: GPLv3, see COPYING for more details.
12 """
12 """
13 # This program is free software: you can redistribute it and/or modify
13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or
15 # the Free Software Foundation, either version 3 of the License, or
16 # (at your option) any later version.
16 # (at your option) any later version.
17 #
17 #
18 # This program is distributed in the hope that it will be useful,
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
21 # GNU General Public License for more details.
22 #
22 #
23 # You should have received a copy of the GNU General Public License
23 # You should have received a copy of the GNU General Public License
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 import os
25 import os
26 import sys
26 import sys
27 import traceback
27 import traceback
28 from os.path import dirname as dn, join as jn
28 from os.path import dirname as dn, join as jn
29
29
30 #to get the rhodecode import
30 #to get the rhodecode import
31 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
31 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
32
32
33 from string import strip
33 from string import strip
34 from shutil import rmtree
34 from shutil import rmtree
35
35
36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
38 from whoosh.index import create_in, open_dir
38 from whoosh.index import create_in, open_dir
39 from whoosh.formats import Characters
39 from whoosh.formats import Characters
40 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
40 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
41
41
42 from webhelpers.html.builder import escape
42 from webhelpers.html.builder import escape
43 from sqlalchemy import engine_from_config
43 from sqlalchemy import engine_from_config
44 from vcs.utils.lazy import LazyProperty
44 from vcs.utils.lazy import LazyProperty
45
45
46 from rhodecode.model import init_model
46 from rhodecode.model import init_model
47 from rhodecode.model.scm import ScmModel
47 from rhodecode.model.scm import ScmModel
48 from rhodecode.model.repo import RepoModel
48 from rhodecode.config.environment import load_environment
49 from rhodecode.config.environment import load_environment
49 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
50 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
50 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
51
52
#EXTENSIONS WE WANT TO INDEX CONTENT OF
INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()

#CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


#INDEX SCHEMA DEFINITION
# - repository/path/extension are stored so search results can link back
#   to the indexed file
# - content uses Characters format so character offsets are kept for
#   fragment extraction/highlighting (see ResultWrapper.get_chunks)
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(ANALYZER),
                                  scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))


# name of the Whoosh index directory inside index_location
IDX_NAME = 'HG_INDEX'
# HTML highlighter: wraps hits in <span>, separates fragments with a break marker
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
# fragments of ~200 characters around each hit
FRAGMENTER = SimpleFragmenter(200)
71
72
72
73
class MakeIndex(BasePasterCommand):
    """Paster command that (re)builds the Whoosh full-text-search index.

    Usage: ``paster make-index CONFIG_FILE [--repo-location=PATH]
    [--index-only=repo1,repo2] [-f]``

    The repositories location is optional; when not given it is resolved
    from the database via :class:`RepoModel`.
    """

    max_args = 1
    min_args = 1

    usage = "CONFIG_FILE"
    summary = "Creates index for full text search given configuration file"
    group_name = "RhodeCode"
    takes_config_file = -1
    parser = Command.standard_parser(verbose=True)

    def command(self):
        """Build the index, guarded by an on-disk daemon lock."""
        from pylons import config
        add_cache(config)
        engine = engine_from_config(config, 'sqlalchemy.db1.')
        init_model(engine)

        index_location = config['index_dir']
        # fall back to the configured repositories root when no explicit
        # --repo-location was given on the command line
        repo_location = self.options.repo_location \
            if self.options.repo_location else RepoModel().repos_path
        repo_list = map(strip, self.options.repo_list.split(',')) \
            if self.options.repo_list else None

        #======================================================================
        # WHOOSH DAEMON
        #======================================================================
        from rhodecode.lib.pidlock import LockHeld, DaemonLock
        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
        try:
            l = DaemonLock(file=jn(dn(dn(index_location)), 'make_index.lock'))
            try:
                WhooshIndexingDaemon(index_location=index_location,
                                     repo_location=repo_location,
                                     repo_list=repo_list)\
                    .run(full_index=self.options.full_index)
            finally:
                # always drop the lock, even if indexing fails, so the next
                # run is not blocked by a stale lock file
                l.release()
        except LockHeld:
            # another make-index run holds the lock; bail out quietly
            sys.exit(1)

    def update_parser(self):
        """Register command-line options on the shared option parser."""
        self.parser.add_option('--repo-location',
                          action='store',
                          dest='repo_location',
                          help="Specifies repositories location to index "
                               "OPTIONAL, resolved from the database when "
                               "not given",
                          )
        self.parser.add_option('--index-only',
                          action='store',
                          dest='repo_list',
                          help="Specifies a comma separated list of "
                               "repositories to build index on OPTIONAL",
                          )
        self.parser.add_option('-f',
                          action='store_true',
                          dest='full_index',
                          help="Specifies that index should be made full i.e"
                                " destroy old and build from scratch",
                          default=False)
129
130
class ResultWrapper(object):
    """Lazy wrapper over a Whoosh searcher/matcher pair.

    Iterating (or slicing) yields the stored fields of each matching
    document, augmented with a highlighted short content extract.
    """

    def __init__(self, search_type, searcher, matcher, highlight_items):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # half of the FRAGMENTER size (200): context taken on each side
        # of a hit when extracting short content
        self.fragment_size = 200 / 2

    @LazyProperty
    def doc_ids(self):
        """Drain the matcher once, collecting [docnum, chunk offsets] pairs."""
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            chunks = [offsets for offsets in self.get_chunks()]
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows iteration over results, lazily generating content.

        Each iteration yields the full result dict for one document.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getitem__(self, key):
        """
        Slicing of resultWrapper; ``key`` is expected to be a slice object.
        """
        i, j = key.start, key.stop

        # avoid shadowing the builtin ``slice``
        results = []
        for docid in self.doc_ids[i:j]:
            results.append(self.get_full_content(docid))
        return results

    def get_full_content(self, docid):
        """Return stored fields for ``docid`` plus short/highlighted content
        and the file path relative to the repository root."""
        res = self.searcher.stored_fields(docid[0])
        # strip everything up to and including the repository name so
        # f_path is relative to the repository root
        f_path = res['path'][res['path'].find(res['repository']) \
                             + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def get_short_content(self, res, chunks):
        """Join the (start, end) ``chunks`` slices of the stored content."""
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but not overlap chunks so it doesn't highlight the same
        close occurrences twice.

        Yields (start, end) character offsets, each hit padded by
        ``fragment_size`` on both sides, clamped so consecutive chunks
        never overlap.
        """
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            # clamp against the previous chunk's end to avoid overlap
            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)

    def highlight(self, content, top=5):
        """HTML-highlight ``content``; only content searches are highlighted."""
        if self.search_type != 'content':
            return ''
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
General Comments 0
You need to be logged in to leave comments. Login now