##// END OF EJS Templates
Reimplemented searching for speed on large files and added paging for search results...
marcink -
r478:7010af6e celery
parent child Browse files
Show More
@@ -1,113 +1,98 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 # search controller for pylons
4 4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 5 #
6 6 # This program is free software; you can redistribute it and/or
7 7 # modify it under the terms of the GNU General Public License
8 8 # as published by the Free Software Foundation; version 2
9 9 # of the License or (at your option) any later version of the license.
10 10 #
11 11 # This program is distributed in the hope that it will be useful,
12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 14 # GNU General Public License for more details.
15 15 #
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program; if not, write to the Free Software
18 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 19 # MA 02110-1301, USA.
20 20 """
21 21 Created on Aug 7, 2010
22 22 search controller for pylons
23 23 @author: marcink
24 24 """
25 25 from pylons import request, response, session, tmpl_context as c, url
26 26 from pylons.controllers.util import abort, redirect
27 27 from pylons_app.lib.auth import LoginRequired
28 28 from pylons_app.lib.base import BaseController, render
29 from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
30 from webhelpers.html.builder import escape
31 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
32 ContextFragmenter
29 from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
30 from webhelpers.paginate import Page
31 from webhelpers.util import update_params
33 32 from pylons.i18n.translation import _
34 33 from whoosh.index import open_dir, EmptyIndexError
35 34 from whoosh.qparser import QueryParser, QueryParserError
36 35 from whoosh.query import Phrase
37 36 import logging
38 37 import traceback
39 38
40 39 log = logging.getLogger(__name__)
41 40
42 41 class SearchController(BaseController):
43 42
44 43 @LoginRequired()
45 44 def __before__(self):
46 45 super(SearchController, self).__before__()
47 46
48
49 47 def index(self):
50 48 c.formated_results = []
51 49 c.runtime = ''
52 search_items = set()
53 50 c.cur_query = request.GET.get('q', None)
54 51 if c.cur_query:
55 52 cur_query = c.cur_query.lower()
56 53
57
58 54 if c.cur_query:
55 p = int(request.params.get('page', 1))
56 highlight_items = set()
59 57 try:
60 58 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
61 59 searcher = idx.searcher()
62 60
63 61 qp = QueryParser("content", schema=SCHEMA)
64 62 try:
65 63 query = qp.parse(unicode(cur_query))
66 64
67 65 if isinstance(query, Phrase):
68 search_items.update(query.words)
66 highlight_items.update(query.words)
69 67 else:
70 68 for i in query.all_terms():
71 search_items.add(i[1])
69 if i[0] == 'content':
70 highlight_items.add(i[1])
71
72 matcher = query.matcher(searcher)
72 73
73 74 log.debug(query)
74 log.debug(search_items)
75 log.debug(highlight_items)
75 76 results = searcher.search(query)
77 res_ln = len(results)
76 78 c.runtime = '%s results (%.3f seconds)' \
77 % (len(results), results.runtime)
78
79 analyzer = ANALYZER
80 formatter = HtmlFormatter('span',
81 between='\n<span class="break">...</span>\n')
82
83 #how the parts are splitted within the same text part
84 fragmenter = SimpleFragmenter(200)
85 #fragmenter = ContextFragmenter(search_items)
79 % (res_ln, results.runtime)
86 80
87 for res in results:
88 d = {}
89 d.update(res)
90 hl = highlight(escape(res['content']), search_items,
91 analyzer=analyzer,
92 fragmenter=fragmenter,
93 formatter=formatter,
94 top=5)
95 f_path = res['path'][res['path'].find(res['repository']) \
96 + len(res['repository']):].lstrip('/')
97 d.update({'content_short':hl,
98 'f_path':f_path})
99 #del d['content']
100 c.formated_results.append(d)
81 def url_generator(**kw):
82 return update_params("?q=%s" % c.cur_query, **kw)
83
84 c.formated_results = Page(
85 ResultWrapper(searcher, matcher, highlight_items),
86 page=p, item_count=res_ln,
87 items_per_page=10, url=url_generator)
101 88
102 89 except QueryParserError:
103 90 c.runtime = _('Invalid search query. Try quoting it.')
104
91 searcher.close()
105 92 except (EmptyIndexError, IOError):
106 93 log.error(traceback.format_exc())
107 94 log.error('Empty Index data')
108 95 c.runtime = _('There is no index to search in. Please run whoosh indexer')
109 96
110
111
112 97 # Return a rendered template
113 98 return render('/search/search.html')
@@ -1,41 +1,140 b''
1 import sys
1 from os.path import dirname as dn, join as jn
2 from pidlock import LockHeld, DaemonLock
3 from pylons_app.config.environment import load_environment
4 from pylons_app.model.hg_model import HgModel
5 from shutil import rmtree
6 from webhelpers.html.builder import escape
7 from vcs.utils.lazy import LazyProperty
8
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 from whoosh.index import create_in, open_dir
12 from whoosh.formats import Characters
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14
2 15 import os
3 from pidlock import LockHeld, DaemonLock
16 import sys
4 17 import traceback
5 18
6 from os.path import dirname as dn
7 from os.path import join as jn
19
8 20
9 21 #to get the pylons_app import
10 22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11 23
12 from pylons_app.config.environment import load_environment
13 from pylons_app.model.hg_model import HgModel
14 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15 from whoosh.fields import TEXT, ID, STORED, Schema
16 from whoosh.index import create_in, open_dir
17 from shutil import rmtree
18 24
19 25 #LOCATION WE KEEP THE INDEX
20 26 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21 27
22 28 #EXTENSIONS WE WANT TO INDEX CONTENT OF
23 29 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h',
25 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
30 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
31 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26 32 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27 33 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28 34 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
29 35 'yaws']
30 36
31 37 #CUSTOM ANALYZER wordsplit + lowercase filter
32 38 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
33 39
40
34 41 #INDEX SCHEMA DEFINITION
35 42 SCHEMA = Schema(owner=TEXT(),
36 43 repository=TEXT(stored=True),
37 44 path=ID(stored=True, unique=True),
38 content=TEXT(stored=True, analyzer=ANALYZER),
45 content=FieldType(format=Characters(ANALYZER),
46 scorable=True, stored=True),
39 47 modtime=STORED(),extension=TEXT(stored=True))
40 48
41 IDX_NAME = 'HG_INDEX' No newline at end of file
49
50 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
class ResultWrapper(object):
    """
    Lazy, sliceable view over the documents matched by a whoosh matcher.

    Only the matching document numbers (and the character offsets of the
    excerpts around each hit) are collected up front; stored fields and
    highlighted excerpts are built on demand for the documents that are
    actually iterated or sliced, e.g. by a paginator.

    @param searcher: object exposing ``stored_fields(docnum)``
    @param matcher: matcher exposing ``is_active()``, ``id()``, ``next()``
        and ``spans()``; it is consumed by the first access to ``doc_ids``
    @param highlight_items: terms passed to the highlighter
    """

    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        self.fragment_size = 150 * 2
        # docnum -> list of (start, end) character offsets. Filled in by
        # ``doc_ids`` while the matcher is still positioned on the document,
        # because spans cannot be read back once the matcher has moved on.
        self._chunks = {}

    @LazyProperty
    def doc_ids(self):
        """
        List of matching document numbers, in matcher order.

        Walks the matcher to exhaustion exactly once and records the
        excerpt offsets of every document along the way.
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            # spans() describes the document the matcher currently points
            # at, so it has to be read before calling next()
            self._chunks[docnum] = list(self._iter_chunks())
            docs_id.append(docnum)
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows iteration over results, lazily generating the content
        of each one.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getitem__(self, key):
        """
        Index or slice the results; a slice returns a list of result dicts,
        an integer returns a single result dict.
        """
        if isinstance(key, slice):
            return [self.get_full_content(docid)
                    for docid in self.doc_ids[key]]
        return self.get_full_content(self.doc_ids[key])

    def __getslice__(self, i, j):
        """
        Slicing of ResultWrapper (still used for plain ``obj[i:j]`` slices
        on Python 2).
        """
        return [self.get_full_content(docid) for docid in self.doc_ids[i:j]]

    def get_full_content(self, docid):
        """
        Build the result dict for a single document.

        @param docid: document number, as found in ``doc_ids``
        @return: the stored fields of the document extended with
            ``content_short`` (raw excerpt), ``content_short_hl``
            (highlighted excerpt) and ``f_path`` (path with everything up to
            and including the repository name stripped)
        """
        res = self.searcher.stored_fields(docid)
        f_path = res['path'][res['path'].find(res['repository']) \
                             + len(res['repository']):].lstrip('/')

        content_short = ''.join(self.get_short_content(res, docid))
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def get_short_content(self, res, docid=None):
        """
        Yield the pieces of ``res["content"]`` surrounding each hit, without
        overlapping, so close occurrences are not shown twice.

        @param res: stored fields of the document; must contain ``content``
        @param docid: document number whose recorded offsets should be used.
            When omitted, the offsets are read from the matcher's current
            position, which is only meaningful while the matcher is still
            positioned on that document.
        """
        if docid is None:
            chunks = self._iter_chunks()
        else:
            # make sure the matcher has been scanned and offsets recorded
            self.doc_ids
            chunks = self._chunks.get(docid, ())
        for start, end in chunks:
            yield res["content"][start:end]

    def _iter_chunks(self):
        """
        Yield non-overlapping ``(start, end)`` character offsets around each
        span of the document the matcher is currently positioned on.
        """
        last_end = 0
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            # widen the hit by fragment_size on both sides, but never start
            # before the end of the previous chunk
            chunk_start = max(last_end, start - self.fragment_size)
            chunk_end = end + self.fragment_size
            if chunk_end <= chunk_start:
                # entirely covered by the previous chunk
                continue
            last_end = chunk_end
            yield (chunk_start, chunk_end)

    def highlight(self, content, top=5):
        """
        Return ``content`` HTML-escaped, with the search terms highlighted.

        @param content: text to highlight
        @param top: forwarded to the highlighter as its ``top`` argument
        """
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
@@ -1,69 +1,71 b''
1 1 ## -*- coding: utf-8 -*-
2 2 <%inherit file="/base/base.html"/>
3 3 <%def name="title()">
4 4 ${_('Search')}: ${c.cur_query}
5 5 </%def>
6 6 <%def name="breadcrumbs()">
7 7 ${c.hg_app_name}
8 8 </%def>
9 9 <%def name="page_nav()">
10 10 ${self.menu('home')}
11 11 </%def>
12 12 <%def name="main()">
13 13
14 14 <div class="box">
15 15 <!-- box / title -->
16 16 <div class="title">
17 17 <h5>${_('Search')}</h5>
18 18 </div>
19 19 <!-- end box / title -->
20 20 ${h.form('search',method='get')}
21 21 <div class="form">
22 22 <div class="fields">
23 23
24 24 <div class="field ">
25 25 <div class="label">
26 26 <label for="q">${_('Search:')}</label>
27 27 </div>
28 28 <div class="input">
29 29 ${h.text('q',c.cur_query,class_="small")}
30 30 <div class="button highlight">
31 31 <input type="submit" value="${_('Search')}" class="ui-button ui-widget ui-state-default ui-corner-all"/>
32 32 </div>
33 33 <div style="font-weight: bold;clear:both;padding: 5px">${c.runtime}</div>
34 34 </div>
35 35 </div>
36 36 </div>
37 37 </div>
38 38 ${h.end_form()}
39 39
40 40 %for cnt,sr in enumerate(c.formated_results):
41 41 %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'):
42 42 <div class="table">
43 43 <div id="body${cnt}" class="codeblock">
44 44 <div class="code-header">
45 45 <div class="revision">${h.link_to(h.literal('%s &raquo; %s' % (sr['repository'],sr['f_path'])),
46 46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
47 47 </div>
48 48 <div class="code-body">
49 <pre>${h.literal(sr['content_short'])}</pre>
49 <pre>${h.literal(sr['content_short_hl'])}</pre>
50 50 </div>
51 51 </div>
52 52 </div>
53 53 %else:
54 54 %if cnt == 0:
55 55 <div class="table">
56 56 <div id="body${cnt}" class="codeblock">
57 57 <div class="error">${_('Permission denied')}</div>
58 58 </div>
59 59 </div>
60 60 %endif
61 61
62 62 %endif
63 63 %endfor
64
65
66
64 %if c.cur_query:
65 <div class="pagination-wh pagination-left">
66 ${c.formated_results.pager('$link_previous ~2~ $link_next')}
67 </div>
68 %endif
67 69 </div>
68 70
69 71 </%def>
@@ -1,49 +1,49 b''
1 1 from pylons_app import get_version
2 2 try:
3 3 from setuptools import setup, find_packages
4 4 except ImportError:
5 5 from ez_setup import use_setuptools
6 6 use_setuptools()
7 7 from setuptools import setup, find_packages
8 8
9 9 setup(
10 10 name='HgApp-%s'%get_version(),
11 11 version=get_version(),
12 12 description='Mercurial repository serving and browsing app',
13 13 keywords='mercurial web hgwebdir replacement serving hgweb',
14 14 license='BSD',
15 15 author='marcin kuzminski',
16 16 author_email='marcin@python-works.com',
17 17 url='http://hg.python-works.com',
18 18 install_requires=[
19 19 "Pylons>=1.0.0",
20 20 "SQLAlchemy>=0.6",
21 21 "babel",
22 22 "Mako>=0.3.2",
23 "vcs>=0.1.4",
23 "vcs>=0.1.5",
24 24 "pygments>=1.3.0",
25 25 "mercurial>=1.6",
26 26 "pysqlite",
27 "whoosh==1.0.0b10",
27 "whoosh==1.0.0b16",
28 28 "py-bcrypt",
29 29 "celery",
30 30 ],
31 31 setup_requires=["PasteScript>=1.6.3"],
32 32 packages=find_packages(exclude=['ez_setup']),
33 33 include_package_data=True,
34 34 test_suite='nose.collector',
35 35 package_data={'pylons_app': ['i18n/*/LC_MESSAGES/*.mo']},
36 36 message_extractors={'pylons_app': [
37 37 ('**.py', 'python', None),
38 38 ('templates/**.mako', 'mako', {'input_encoding': 'utf-8'}),
39 39 ('public/**', 'ignore', None)]},
40 40 zip_safe=False,
41 41 paster_plugins=['PasteScript', 'Pylons'],
42 42 entry_points="""
43 43 [paste.app_factory]
44 44 main = pylons_app.config.middleware:make_app
45 45
46 46 [paste.app_install]
47 47 main = pylons.util:PylonsInstaller
48 48 """,
49 49 )
General Comments 0
You need to be logged in to leave comments. Login now