##// END OF EJS Templates
Reimplemented searching for speed on large files and added paging for search results...
marcink -
r478:7010af6e celery
parent child Browse files
Show More
@@ -1,113 +1,98 b''
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 # search controller for pylons
3 # search controller for pylons
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 #
5 #
6 # This program is free software; you can redistribute it and/or
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 2
8 # as published by the Free Software Foundation; version 2
9 # of the License or (at your option) any later version of the license.
9 # of the License or (at your option) any later version of the license.
10 #
10 #
11 # This program is distributed in the hope that it will be useful,
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
14 # GNU General Public License for more details.
15 #
15 #
16 # You should have received a copy of the GNU General Public License
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
19 # MA 02110-1301, USA.
20 """
20 """
21 Created on Aug 7, 2010
21 Created on Aug 7, 2010
22 search controller for pylons
22 search controller for pylons
23 @author: marcink
23 @author: marcink
24 """
24 """
25 from pylons import request, response, session, tmpl_context as c, url
25 from pylons import request, response, session, tmpl_context as c, url
26 from pylons.controllers.util import abort, redirect
26 from pylons.controllers.util import abort, redirect
27 from pylons_app.lib.auth import LoginRequired
27 from pylons_app.lib.auth import LoginRequired
28 from pylons_app.lib.base import BaseController, render
28 from pylons_app.lib.base import BaseController, render
29 from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
29 from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
30 from webhelpers.html.builder import escape
30 from webhelpers.paginate import Page
31 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
31 from webhelpers.util import update_params
32 ContextFragmenter
33 from pylons.i18n.translation import _
32 from pylons.i18n.translation import _
34 from whoosh.index import open_dir, EmptyIndexError
33 from whoosh.index import open_dir, EmptyIndexError
35 from whoosh.qparser import QueryParser, QueryParserError
34 from whoosh.qparser import QueryParser, QueryParserError
36 from whoosh.query import Phrase
35 from whoosh.query import Phrase
37 import logging
36 import logging
38 import traceback
37 import traceback
39
38
40 log = logging.getLogger(__name__)
39 log = logging.getLogger(__name__)
41
40
class SearchController(BaseController):
    """Full-text search over the whoosh index, rendered as paged results."""

    @LoginRequired()
    def __before__(self):
        super(SearchController, self).__before__()

    def index(self):
        """
        Run the query given in the ``q`` GET parameter and render the
        requested result page (``page`` parameter, defaults to 1).

        Sets on the template context:
        ``c.cur_query``        raw query string, or None when absent
        ``c.runtime``          status line (hit count and timing, or an
                               error message)
        ``c.formated_results`` a ``Page`` of result dicts; empty when there
                               is no query or the search failed
        """
        c.runtime = ''
        c.cur_query = request.GET.get('q', None)

        def url_generator(**kw):
            # every pager link has to carry the current query along
            return update_params("?q=%s" % c.cur_query, **kw)

        # The template calls ``c.formated_results.pager()`` whenever a query
        # is present, so it must always receive a Page -- a plain list would
        # blow up on the error paths below (bad query / missing index).
        c.formated_results = Page([], page=1, items_per_page=10,
                                  url=url_generator)

        if c.cur_query:
            cur_query = c.cur_query.lower()

            # ``page`` comes straight from the URL; fall back to the first
            # page instead of failing the request on a non-numeric value
            try:
                p = int(request.params.get('page', 1))
            except ValueError:
                p = 1

            highlight_items = set()
            try:
                idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
                searcher = idx.searcher()
                try:
                    qp = QueryParser("content", schema=SCHEMA)
                    query = qp.parse(unicode(cur_query))

                    # collect only the terms that hit the ``content`` field;
                    # those are the ones highlighted in the snippets
                    if isinstance(query, Phrase):
                        highlight_items.update(query.words)
                    else:
                        for i in query.all_terms():
                            if i[0] == 'content':
                                highlight_items.add(i[1])

                    matcher = query.matcher(searcher)

                    log.debug(query)
                    log.debug(highlight_items)
                    results = searcher.search(query)
                    res_ln = len(results)
                    c.runtime = '%s results (%.3f seconds)' \
                                % (res_ln, results.runtime)

                    # Page slices the wrapper right here in its constructor,
                    # so the searcher can be closed as soon as it returns
                    c.formated_results = Page(
                        ResultWrapper(searcher, matcher, highlight_items),
                        page=p, item_count=res_ln,
                        items_per_page=10, url=url_generator)

                except QueryParserError:
                    c.runtime = _('Invalid search query. Try quoting it.')
                finally:
                    # release the index reader on every exit path
                    searcher.close()
            except (EmptyIndexError, IOError):
                log.error(traceback.format_exc())
                log.error('Empty Index data')
                c.runtime = _('There is no index to search in. Please run whoosh indexer')

        # Return a rendered template
        return render('/search/search.html')
@@ -1,41 +1,140 b''
1 import sys
1 from os.path import dirname as dn, join as jn
2 from pidlock import LockHeld, DaemonLock
3 from pylons_app.config.environment import load_environment
4 from pylons_app.model.hg_model import HgModel
5 from shutil import rmtree
6 from webhelpers.html.builder import escape
7 from vcs.utils.lazy import LazyProperty
8
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 from whoosh.index import create_in, open_dir
12 from whoosh.formats import Characters
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14
2 import os
15 import os
3 from pidlock import LockHeld, DaemonLock
16 import sys
4 import traceback
17 import traceback
5
18
6 from os.path import dirname as dn
19
7 from os.path import join as jn
8
20
9 #to get the pylons_app import
21 #to get the pylons_app import
10 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11
23
12 from pylons_app.config.environment import load_environment
13 from pylons_app.model.hg_model import HgModel
14 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15 from whoosh.fields import TEXT, ID, STORED, Schema
16 from whoosh.index import create_in, open_dir
17 from shutil import rmtree
18
24
19 #LOCATION WE KEEP THE INDEX
25 #LOCATION WE KEEP THE INDEX
20 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
26 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21
27
22 #EXTENSIONS WE WANT TO INDEX CONTENT OF
28 #EXTENSIONS WE WANT TO INDEX CONTENT OF
23 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
29 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h',
30 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
25 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
31 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
32 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
33 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
34 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
29 'yaws']
35 'yaws']
30
36
31 #CUSTOM ANALYZER wordsplit + lowercase filter
37 #CUSTOM ANALYZER wordsplit + lowercase filter
32 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
38 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
33
39
40
34 #INDEX SCHEMA DEFINITION
41 #INDEX SCHEMA DEFINITION
35 SCHEMA = Schema(owner=TEXT(),
42 SCHEMA = Schema(owner=TEXT(),
36 repository=TEXT(stored=True),
43 repository=TEXT(stored=True),
37 path=ID(stored=True, unique=True),
44 path=ID(stored=True, unique=True),
38 content=TEXT(stored=True, analyzer=ANALYZER),
45 content=FieldType(format=Characters(ANALYZER),
39 modtime=STORED(),extension=TEXT(stored=True))
46 scorable=True, stored=True),
47 modtime=STORED(), extension=TEXT(stored=True))
48
49
50 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
class ResultWrapper(object):
    """
    Lazy, sliceable view over the documents matched by a query.

    Only the document numbers (and the snippet offsets belonging to them)
    are collected up front; stored fields are fetched and highlighted per
    document, when that document is actually iterated over or sliced out.

    @param searcher: object providing ``stored_fields(docnum)``
    @param matcher: matcher positioned on the first matching document;
        ``is_active()``, ``id()``, ``next()`` and ``spans()`` are used
    @param highlight_items: terms to highlight in the generated snippets
    """

    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # number of characters of context kept on each side of a match
        self.fragment_size = 150 * 2
        # docnum -> [(start, end), ...] snippet offsets, filled by ``doc_ids``
        self._chunks = {}

    @LazyProperty
    def doc_ids(self):
        """
        List of matching document numbers, in matcher order.

        Walks the matcher to its end (once; the result is cached) and
        records the snippet offsets of every document on the way.
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            # The offsets are captured now, while the matcher is still on
            # this document; once the walk is over it is exhausted.
            # NOTE(review): assumes ``spans()`` describes the matcher's
            # current document -- confirm against the whoosh version in use.
            self._chunks[docnum] = list(self._span_offsets())
            docs_id.append(docnum)
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows iteration over results, generating the content lazily.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getitem__(self, key):
        """
        Index or slice access; a slice returns a list of result dicts.
        """
        if isinstance(key, slice):
            return [self.get_full_content(docid)
                    for docid in self.doc_ids[key]]
        return self.get_full_content(self.doc_ids[key])

    def __getslice__(self, i, j):
        """
        Slicing of resultWrapper (Python 2 simple-slice protocol).
        """
        return [self.get_full_content(docid) for docid in self.doc_ids[i:j]]

    def get_full_content(self, docid):
        """
        Return the stored fields of ``docid`` extended with ``f_path``
        (path relative to the repository), ``content_short`` (joined
        snippets) and ``content_short_hl`` (highlighted snippets).
        """
        res = self.searcher.stored_fields(docid)

        # strip everything up to and including the repository name
        path = res['path']
        repo = res['repository']
        repo_at = path.find(repo)
        if repo_at != -1:
            path = path[repo_at + len(repo):]
        # else: repository name is not part of the path; keep it whole
        # rather than cutting at a meaningless offset
        f_path = path.lstrip('/')

        # None when ``doc_ids`` has not seen this document; the snippets
        # are then taken from the matcher's current position
        chunks = self._chunks.get(docid)
        content_short = ''.join(self.get_short_content(res, chunks))
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def _span_offsets(self):
        """
        Yield ``(start, end)`` character offsets, one per span reported by
        the matcher, widened by ``fragment_size`` on both sides and trimmed
        so that a chunk never starts before the previous one ended.
        """
        last_end = 0
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size
            if start_offseted < last_end:
                start_offseted = last_end
            last_end = end_offseted
            yield start_offseted, end_offseted

    def get_short_content(self, res, chunks=None):
        """
        Yield the pieces of ``res["content"]`` surrounding the matches,
        without overlapping chunks, so that close occurrences are not
        highlighted twice.

        @param res: stored fields of a document; ``res["content"]`` is sliced
        @param chunks: iterable of ``(start, end)`` offsets; when omitted
            they are computed from the matcher's current spans
        """
        if chunks is None:
            chunks = self._span_offsets()
        for start_offseted, end_offseted in chunks:
            yield res["content"][start_offseted:end_offseted]

    def highlight(self, content, top=5):
        """
        Return ``content`` HTML-escaped, with ``highlight_items`` marked up.

        @param content: text to highlight
        @param top: forwarded as ``top`` to whoosh's ``highlight``
        """
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
@@ -1,69 +1,71 b''
1 ## -*- coding: utf-8 -*-
1 ## -*- coding: utf-8 -*-
2 <%inherit file="/base/base.html"/>
2 <%inherit file="/base/base.html"/>
3 <%def name="title()">
3 <%def name="title()">
4 ${_('Search')}: ${c.cur_query}
4 ${_('Search')}: ${c.cur_query}
5 </%def>
5 </%def>
6 <%def name="breadcrumbs()">
6 <%def name="breadcrumbs()">
7 ${c.hg_app_name}
7 ${c.hg_app_name}
8 </%def>
8 </%def>
9 <%def name="page_nav()">
9 <%def name="page_nav()">
10 ${self.menu('home')}
10 ${self.menu('home')}
11 </%def>
11 </%def>
12 <%def name="main()">
12 <%def name="main()">
13
13
14 <div class="box">
14 <div class="box">
15 <!-- box / title -->
15 <!-- box / title -->
16 <div class="title">
16 <div class="title">
17 <h5>${_('Search')}</h5>
17 <h5>${_('Search')}</h5>
18 </div>
18 </div>
19 <!-- end box / title -->
19 <!-- end box / title -->
20 ${h.form('search',method='get')}
20 ${h.form('search',method='get')}
21 <div class="form">
21 <div class="form">
22 <div class="fields">
22 <div class="fields">
23
23
24 <div class="field ">
24 <div class="field ">
25 <div class="label">
25 <div class="label">
26 <label for="q">${_('Search:')}</label>
26 <label for="q">${_('Search:')}</label>
27 </div>
27 </div>
28 <div class="input">
28 <div class="input">
29 ${h.text('q',c.cur_query,class_="small")}
29 ${h.text('q',c.cur_query,class_="small")}
30 <div class="button highlight">
30 <div class="button highlight">
31 <input type="submit" value="${_('Search')}" class="ui-button ui-widget ui-state-default ui-corner-all"/>
31 <input type="submit" value="${_('Search')}" class="ui-button ui-widget ui-state-default ui-corner-all"/>
32 </div>
32 </div>
33 <div style="font-weight: bold;clear:both;padding: 5px">${c.runtime}</div>
33 <div style="font-weight: bold;clear:both;padding: 5px">${c.runtime}</div>
34 </div>
34 </div>
35 </div>
35 </div>
36 </div>
36 </div>
37 </div>
37 </div>
38 ${h.end_form()}
38 ${h.end_form()}
39
39
40 %for cnt,sr in enumerate(c.formated_results):
40 %for cnt,sr in enumerate(c.formated_results):
41 %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
41 %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
42 <div class="table">
42 <div class="table">
43 <div id="body${cnt}" class="codeblock">
43 <div id="body${cnt}" class="codeblock">
44 <div class="code-header">
44 <div class="code-header">
45 <div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
45 <div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
47 </div>
47 </div>
48 <div class="code-body">
48 <div class="code-body">
49 <pre>${h.literal(sr['content_short'])}</pre>
49 <pre>${h.literal(sr['content_short_hl'])}</pre>
50 </div>
50 </div>
51 </div>
51 </div>
52 </div>
52 </div>
53 %else:
53 %else:
54 %if cnt == 0:
54 %if cnt == 0:
55 <div class="table">
55 <div class="table">
56 <div id="body${cnt}" class="codeblock">
56 <div id="body${cnt}" class="codeblock">
57 <div class="error">${_('Permission denied')}</div>
57 <div class="error">${_('Permission denied')}</div>
58 </div>
58 </div>
59 </div>
59 </div>
60 %endif
60 %endif
61
61
62 %endif
62 %endif
63 %endfor
63 %endfor
64
64 %if c.cur_query:
65
65 <div class="pagination-wh pagination-left">
66
66 ${c.formated_results.pager('$link_previous ~2~ $link_next')}
67 </div>
68 %endif
67 </div>
69 </div>
68
70
69 </%def>
71 </%def>
@@ -1,49 +1,49 b''
1 from pylons_app import get_version
1 from pylons_app import get_version
2 try:
2 try:
3 from setuptools import setup, find_packages
3 from setuptools import setup, find_packages
4 except ImportError:
4 except ImportError:
5 from ez_setup import use_setuptools
5 from ez_setup import use_setuptools
6 use_setuptools()
6 use_setuptools()
7 from setuptools import setup, find_packages
7 from setuptools import setup, find_packages
8
8
9 setup(
9 setup(
10 name='HgApp-%s'%get_version(),
10 name='HgApp-%s' % get_version(),
11 version=get_version(),
11 version=get_version(),
12 description='Mercurial repository serving and browsing app',
12 description='Mercurial repository serving and browsing app',
13 keywords='mercurial web hgwebdir replacement serving hgweb',
13 keywords='mercurial web hgwebdir replacement serving hgweb',
14 license='BSD',
14 license='BSD',
15 author='marcin kuzminski',
15 author='marcin kuzminski',
16 author_email='marcin@python-works.com',
16 author_email='marcin@python-works.com',
17 url='http://hg.python-works.com',
17 url='http://hg.python-works.com',
18 install_requires=[
18 install_requires=[
19 "Pylons>=1.0.0",
19 "Pylons>=1.0.0",
20 "SQLAlchemy>=0.6",
20 "SQLAlchemy>=0.6",
21 "babel",
21 "babel",
22 "Mako>=0.3.2",
22 "Mako>=0.3.2",
23 "vcs>=0.1.4",
23 "vcs>=0.1.5",
24 "pygments>=1.3.0",
24 "pygments>=1.3.0",
25 "mercurial>=1.6",
25 "mercurial>=1.6",
26 "pysqlite",
26 "pysqlite",
27 "whoosh==1.0.0b10",
27 "whoosh==1.0.0b16",
28 "py-bcrypt",
28 "py-bcrypt",
29 "celery",
29 "celery",
30 ],
30 ],
31 setup_requires=["PasteScript>=1.6.3"],
31 setup_requires=["PasteScript>=1.6.3"],
32 packages=find_packages(exclude=['ez_setup']),
32 packages=find_packages(exclude=['ez_setup']),
33 include_package_data=True,
33 include_package_data=True,
34 test_suite='nose.collector',
34 test_suite='nose.collector',
35 package_data={'pylons_app': ['i18n/*/LC_MESSAGES/*.mo']},
35 package_data={'pylons_app': ['i18n/*/LC_MESSAGES/*.mo']},
36 message_extractors={'pylons_app': [
36 message_extractors={'pylons_app': [
37 ('**.py', 'python', None),
37 ('**.py', 'python', None),
38 ('templates/**.mako', 'mako', {'input_encoding': 'utf-8'}),
38 ('templates/**.mako', 'mako', {'input_encoding': 'utf-8'}),
39 ('public/**', 'ignore', None)]},
39 ('public/**', 'ignore', None)]},
40 zip_safe=False,
40 zip_safe=False,
41 paster_plugins=['PasteScript', 'Pylons'],
41 paster_plugins=['PasteScript', 'Pylons'],
42 entry_points="""
42 entry_points="""
43 [paste.app_factory]
43 [paste.app_factory]
44 main = pylons_app.config.middleware:make_app
44 main = pylons_app.config.middleware:make_app
45
45
46 [paste.app_install]
46 [paste.app_install]
47 main = pylons.util:PylonsInstaller
47 main = pylons.util:PylonsInstaller
48 """,
48 """,
49 )
49 )
General Comments 0
You need to be logged in to leave comments. Login now