##// END OF EJS Templates
Reimplemented searching for speed on large files and added paging for search results...
marcink -
r478:7010af6e celery
parent child Browse files
Show More
@@ -26,10 +26,9 b' from pylons import request, response, se'
26 from pylons.controllers.util import abort, redirect
26 from pylons.controllers.util import abort, redirect
27 from pylons_app.lib.auth import LoginRequired
27 from pylons_app.lib.auth import LoginRequired
28 from pylons_app.lib.base import BaseController, render
28 from pylons_app.lib.base import BaseController, render
29 from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
29 from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
30 from webhelpers.html.builder import escape
30 from webhelpers.paginate import Page
31 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
31 from webhelpers.util import update_params
32 ContextFragmenter
33 from pylons.i18n.translation import _
32 from pylons.i18n.translation import _
34 from whoosh.index import open_dir, EmptyIndexError
33 from whoosh.index import open_dir, EmptyIndexError
35 from whoosh.qparser import QueryParser, QueryParserError
34 from whoosh.qparser import QueryParser, QueryParserError
@@ -45,69 +44,55 b' class SearchController(BaseController):'
45 def __before__(self):
44 def __before__(self):
46 super(SearchController, self).__before__()
45 super(SearchController, self).__before__()
47
46
48
49 def index(self):
47 def index(self):
50 c.formated_results = []
48 c.formated_results = []
51 c.runtime = ''
49 c.runtime = ''
52 search_items = set()
53 c.cur_query = request.GET.get('q', None)
50 c.cur_query = request.GET.get('q', None)
54 if c.cur_query:
51 if c.cur_query:
55 cur_query = c.cur_query.lower()
52 cur_query = c.cur_query.lower()
56
53
57
58 if c.cur_query:
54 if c.cur_query:
55 p = int(request.params.get('page', 1))
56 highlight_items = set()
59 try:
57 try:
60 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
58 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
61 searcher = idx.searcher()
59 searcher = idx.searcher()
62
60
63 qp = QueryParser("content", schema=SCHEMA)
61 qp = QueryParser("content", schema=SCHEMA)
64 try:
62 try:
65 query = qp.parse(unicode(cur_query))
63 query = qp.parse(unicode(cur_query))
66
64
67 if isinstance(query, Phrase):
65 if isinstance(query, Phrase):
68 search_items.update(query.words)
66 highlight_items.update(query.words)
69 else:
67 else:
70 for i in query.all_terms():
68 for i in query.all_terms():
71 search_items.add(i[1])
69 if i[0] == 'content':
72
70 highlight_items.add(i[1])
73 log.debug(query)
74 log.debug(search_items)
75 results = searcher.search(query)
76 c.runtime = '%s results (%.3f seconds)' \
77 % (len(results), results.runtime)
78
71
79 analyzer = ANALYZER
72 matcher = query.matcher(searcher)
80 formatter = HtmlFormatter('span',
81 between='\n<span class="break">...</span>\n')
82
83 #how the parts are split within the same text part
84 fragmenter = SimpleFragmenter(200)
85 #fragmenter = ContextFragmenter(search_items)
86
73
87 for res in results:
74 log.debug(query)
88 d = {}
75 log.debug(highlight_items)
89 d.update(res)
76 results = searcher.search(query)
90 hl = highlight(escape(res['content']), search_items,
77 res_ln = len(results)
91 analyzer=analyzer,
78 c.runtime = '%s results (%.3f seconds)' \
92 fragmenter=fragmenter,
79 % (res_ln, results.runtime)
93 formatter=formatter,
80
94 top=5)
81 def url_generator(**kw):
95 f_path = res['path'][res['path'].find(res['repository']) \
82 return update_params("?q=%s" % c.cur_query, **kw)
96 + len(res['repository']):].lstrip('/')
83
97 d.update({'content_short':hl,
84 c.formated_results = Page(
98 'f_path':f_path})
85 ResultWrapper(searcher, matcher, highlight_items),
99 #del d['content']
86 page=p, item_count=res_ln,
100 c.formated_results.append(d)
87 items_per_page=10, url=url_generator)
101
88
102 except QueryParserError:
89 except QueryParserError:
103 c.runtime = _('Invalid search query. Try quoting it.')
90 c.runtime = _('Invalid search query. Try quoting it.')
104
91 searcher.close()
105 except (EmptyIndexError, IOError):
92 except (EmptyIndexError, IOError):
106 log.error(traceback.format_exc())
93 log.error(traceback.format_exc())
107 log.error('Empty Index data')
94 log.error('Empty Index data')
108 c.runtime = _('There is no index to search in. Please run whoosh indexer')
95 c.runtime = _('There is no index to search in. Please run whoosh indexer')
109
96
110
111
112 # Return a rendered template
97 # Return a rendered template
113 return render('/search/search.html')
98 return render('/search/search.html')
@@ -1,41 +1,140 b''
1 import sys
1 from os.path import dirname as dn, join as jn
2 from pidlock import LockHeld, DaemonLock
3 from pylons_app.config.environment import load_environment
4 from pylons_app.model.hg_model import HgModel
5 from shutil import rmtree
6 from webhelpers.html.builder import escape
7 from vcs.utils.lazy import LazyProperty
8
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 from whoosh.index import create_in, open_dir
12 from whoosh.formats import Characters
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14
2 import os
15 import os
3 from pidlock import LockHeld, DaemonLock
16 import sys
4 import traceback
17 import traceback
5
18
6 from os.path import dirname as dn
19
7 from os.path import join as jn
8
20
9 #to get the pylons_app import
21 #to get the pylons_app import
10 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11
23
12 from pylons_app.config.environment import load_environment
13 from pylons_app.model.hg_model import HgModel
14 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15 from whoosh.fields import TEXT, ID, STORED, Schema
16 from whoosh.index import create_in, open_dir
17 from shutil import rmtree
18
24
19 #LOCATION WE KEEP THE INDEX
25 #LOCATION WE KEEP THE INDEX
20 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
26 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21
27
22 #EXTENSIONS WE WANT TO INDEX CONTENT OF
28 #EXTENSIONS WE WANT TO INDEX CONTENT OF
23 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
29 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h',
30 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
25 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
31 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
32 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
33 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
34 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
29 'yaws']
35 'yaws']
30
36
31 #CUSTOM ANALYZER wordsplit + lowercase filter
37 #CUSTOM ANALYZER wordsplit + lowercase filter
32 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
38 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
33
39
40
34 #INDEX SCHEMA DEFINITION
41 #INDEX SCHEMA DEFINITION
35 SCHEMA = Schema(owner=TEXT(),
42 SCHEMA = Schema(owner=TEXT(),
36 repository=TEXT(stored=True),
43 repository=TEXT(stored=True),
37 path=ID(stored=True, unique=True),
44 path=ID(stored=True, unique=True),
38 content=TEXT(stored=True, analyzer=ANALYZER),
45 content=FieldType(format=Characters(ANALYZER),
39 modtime=STORED(),extension=TEXT(stored=True))
46 scorable=True, stored=True),
47 modtime=STORED(), extension=TEXT(stored=True))
48
49
50 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
57 class ResultWrapper(object):
58 def __init__(self, searcher, matcher, highlight_items):
59 self.searcher = searcher
60 self.matcher = matcher
61 self.highlight_items = highlight_items
62 self.fragment_size = 150 * 2
63
64 @LazyProperty
65 def doc_ids(self):
66 docs_id = []
67 while self.matcher.is_active():
68 docnum = self.matcher.id()
69 docs_id.append(docnum)
70 self.matcher.next()
71 return docs_id
72
73 def __str__(self):
74 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
75
76 def __repr__(self):
77 return self.__str__()
78
79 def __len__(self):
80 return len(self.doc_ids)
81
82 def __iter__(self):
83 """
84 Allows iteration over results and lazily generates content
85
86 *Requires* implementation of ``__getitem__`` method.
87 """
88 for docid in self.doc_ids:
89 yield self.get_full_content(docid)
40
90
41 IDX_NAME = 'HG_INDEX' No newline at end of file
91 def __getslice__(self, i, j):
92 """
93 Slicing of resultWrapper
94 """
95 slice = []
96 for docid in self.doc_ids[i:j]:
97 slice.append(self.get_full_content(docid))
98 return slice
99
100
101 def get_full_content(self, docid):
102 res = self.searcher.stored_fields(docid)
103 f_path = res['path'][res['path'].find(res['repository']) \
104 + len(res['repository']):].lstrip('/')
105
106 content_short = ''.join(self.get_short_content(res))
107 res.update({'content_short':content_short,
108 'content_short_hl':self.highlight(content_short),
109 'f_path':f_path})
110
111 return res
112
113 def get_short_content(self, res):
114 """
115 Smart function that chunks the content without
116 overlapping chunks, so that close occurrences are not
117 highlighted twice.
118 @param matcher:
119 @param size:
120 """
121 memory = [(0, 0)]
122 for span in self.matcher.spans():
123 start = span.startchar or 0
124 end = span.endchar or 0
125 start_offseted = max(0, start - self.fragment_size)
126 end_offseted = end + self.fragment_size
127 print start_offseted, end_offseted
128 if start_offseted < memory[-1][1]:
129 start_offseted = memory[-1][1]
130 memory.append((start_offseted, end_offseted,))
131 yield res["content"][start_offseted:end_offseted]
132
133 def highlight(self, content, top=5):
134 hl = highlight(escape(content),
135 self.highlight_items,
136 analyzer=ANALYZER,
137 fragmenter=FRAGMENTER,
138 formatter=FORMATTER,
139 top=top)
140 return hl
@@ -46,7 +46,7 b''
46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
47 </div>
47 </div>
48 <div class="code-body">
48 <div class="code-body">
49 <pre>${h.literal(sr['content_short'])}</pre>
49 <pre>${h.literal(sr['content_short_hl'])}</pre>
50 </div>
50 </div>
51 </div>
51 </div>
52 </div>
52 </div>
@@ -59,11 +59,13 b''
59 </div>
59 </div>
60 %endif
60 %endif
61
61
62 %endif
62 %endif
63 %endfor
63 %endfor
64
64 %if c.cur_query:
65
65 <div class="pagination-wh pagination-left">
66
66 ${c.formated_results.pager('$link_previous ~2~ $link_next')}
67 </div>
68 %endif
67 </div>
69 </div>
68
70
69 </%def>
71 </%def>
@@ -7,7 +7,7 b' except ImportError:'
7 from setuptools import setup, find_packages
7 from setuptools import setup, find_packages
8
8
9 setup(
9 setup(
10 name='HgApp-%s'%get_version(),
10 name='HgApp-%s' % get_version(),
11 version=get_version(),
11 version=get_version(),
12 description='Mercurial repository serving and browsing app',
12 description='Mercurial repository serving and browsing app',
13 keywords='mercurial web hgwebdir replacement serving hgweb',
13 keywords='mercurial web hgwebdir replacement serving hgweb',
@@ -20,11 +20,11 b' setup('
20 "SQLAlchemy>=0.6",
20 "SQLAlchemy>=0.6",
21 "babel",
21 "babel",
22 "Mako>=0.3.2",
22 "Mako>=0.3.2",
23 "vcs>=0.1.4",
23 "vcs>=0.1.5",
24 "pygments>=1.3.0",
24 "pygments>=1.3.0",
25 "mercurial>=1.6",
25 "mercurial>=1.6",
26 "pysqlite",
26 "pysqlite",
27 "whoosh==1.0.0b10",
27 "whoosh==1.0.0b16",
28 "py-bcrypt",
28 "py-bcrypt",
29 "celery",
29 "celery",
30 ],
30 ],
General Comments 0
You need to be logged in to leave comments. Login now