##// END OF EJS Templates
Reimplemented searching for speed on large files and added paging for search results...
marcink -
r478:7010af6e celery
parent child Browse files
Show More
@@ -26,10 +26,9 b' from pylons import request, response, se'
26 26 from pylons.controllers.util import abort, redirect
27 27 from pylons_app.lib.auth import LoginRequired
28 28 from pylons_app.lib.base import BaseController, render
29 from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
30 from webhelpers.html.builder import escape
31 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
32 ContextFragmenter
29 from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
30 from webhelpers.paginate import Page
31 from webhelpers.util import update_params
33 32 from pylons.i18n.translation import _
34 33 from whoosh.index import open_dir, EmptyIndexError
35 34 from whoosh.qparser import QueryParser, QueryParserError
@@ -45,17 +44,16 b' class SearchController(BaseController):'
45 44 def __before__(self):
46 45 super(SearchController, self).__before__()
47 46
48
49 47 def index(self):
50 48 c.formated_results = []
51 49 c.runtime = ''
52 search_items = set()
53 50 c.cur_query = request.GET.get('q', None)
54 51 if c.cur_query:
55 52 cur_query = c.cur_query.lower()
56 53
57
58 54 if c.cur_query:
55 p = int(request.params.get('page', 1))
56 highlight_items = set()
59 57 try:
60 58 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
61 59 searcher = idx.searcher()
@@ -65,49 +63,36 b' class SearchController(BaseController):'
65 63 query = qp.parse(unicode(cur_query))
66 64
67 65 if isinstance(query, Phrase):
68 search_items.update(query.words)
66 highlight_items.update(query.words)
69 67 else:
70 68 for i in query.all_terms():
71 search_items.add(i[1])
69 if i[0] == 'content':
70 highlight_items.add(i[1])
71
72 matcher = query.matcher(searcher)
72 73
73 74 log.debug(query)
74 log.debug(search_items)
75 log.debug(highlight_items)
75 76 results = searcher.search(query)
77 res_ln = len(results)
76 78 c.runtime = '%s results (%.3f seconds)' \
77 % (len(results), results.runtime)
78
79 analyzer = ANALYZER
80 formatter = HtmlFormatter('span',
81 between='\n<span class="break">...</span>\n')
82
83 #how the parts are split within the same text part
84 fragmenter = SimpleFragmenter(200)
85 #fragmenter = ContextFragmenter(search_items)
79 % (res_ln, results.runtime)
86 80
87 for res in results:
88 d = {}
89 d.update(res)
90 hl = highlight(escape(res['content']), search_items,
91 analyzer=analyzer,
92 fragmenter=fragmenter,
93 formatter=formatter,
94 top=5)
95 f_path = res['path'][res['path'].find(res['repository']) \
96 + len(res['repository']):].lstrip('/')
97 d.update({'content_short':hl,
98 'f_path':f_path})
99 #del d['content']
100 c.formated_results.append(d)
81 def url_generator(**kw):
82 return update_params("?q=%s" % c.cur_query, **kw)
83
84 c.formated_results = Page(
85 ResultWrapper(searcher, matcher, highlight_items),
86 page=p, item_count=res_ln,
87 items_per_page=10, url=url_generator)
101 88
102 89 except QueryParserError:
103 90 c.runtime = _('Invalid search query. Try quoting it.')
104
91 searcher.close()
105 92 except (EmptyIndexError, IOError):
106 93 log.error(traceback.format_exc())
107 94 log.error('Empty Index data')
108 95 c.runtime = _('There is no index to search in. Please run whoosh indexer')
109 96
110
111
112 97 # Return a rendered template
113 98 return render('/search/search.html')
@@ -1,28 +1,34 b''
1 import sys
1 from os.path import dirname as dn, join as jn
2 from pidlock import LockHeld, DaemonLock
3 from pylons_app.config.environment import load_environment
4 from pylons_app.model.hg_model import HgModel
5 from shutil import rmtree
6 from webhelpers.html.builder import escape
7 from vcs.utils.lazy import LazyProperty
8
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 from whoosh.index import create_in, open_dir
12 from whoosh.formats import Characters
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14
2 15 import os
3 from pidlock import LockHeld, DaemonLock
16 import sys
4 17 import traceback
5 18
6 from os.path import dirname as dn
7 from os.path import join as jn
19
8 20
9 21 #to get the pylons_app import
10 22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11 23
12 from pylons_app.config.environment import load_environment
13 from pylons_app.model.hg_model import HgModel
14 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15 from whoosh.fields import TEXT, ID, STORED, Schema
16 from whoosh.index import create_in, open_dir
17 from shutil import rmtree
18 24
19 25 #LOCATION WE KEEP THE INDEX
20 26 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21 27
22 28 #EXTENSIONS WE WANT TO INDEX CONTENT OF
23 29 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h',
25 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
30 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
31 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26 32 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27 33 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28 34 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
@@ -31,11 +37,104 b" INDEX_EXTENSIONS = ['action', 'adp', 'as"
31 37 #CUSTOM ANALYZER wordsplit + lowercase filter
32 38 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
33 39
40
34 41 #INDEX SCHEMA DEFINITION
35 42 SCHEMA = Schema(owner=TEXT(),
36 43 repository=TEXT(stored=True),
37 44 path=ID(stored=True, unique=True),
38 content=TEXT(stored=True, analyzer=ANALYZER),
45 content=FieldType(format=Characters(ANALYZER),
46 scorable=True, stored=True),
39 47 modtime=STORED(),extension=TEXT(stored=True))
40 48
41 IDX_NAME = 'HG_INDEX' No newline at end of file
49
50 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
57 class ResultWrapper(object):
58 def __init__(self, searcher, matcher, highlight_items):
59 self.searcher = searcher
60 self.matcher = matcher
61 self.highlight_items = highlight_items
62 self.fragment_size = 150 * 2
63
64 @LazyProperty
65 def doc_ids(self):
66 docs_id = []
67 while self.matcher.is_active():
68 docnum = self.matcher.id()
69 docs_id.append(docnum)
70 self.matcher.next()
71 return docs_id
72
73 def __str__(self):
74 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
75
76 def __repr__(self):
77 return self.__str__()
78
79 def __len__(self):
80 return len(self.doc_ids)
81
82 def __iter__(self):
83 """
84 Allows iteration over results, and lazily generates content
85
86 *Requires* implementation of ``__getitem__`` method.
87 """
88 for docid in self.doc_ids:
89 yield self.get_full_content(docid)
90
91 def __getslice__(self, i, j):
92 """
93 Slicing of ResultWrapper
94 """
95 slice = []
96 for docid in self.doc_ids[i:j]:
97 slice.append(self.get_full_content(docid))
98 return slice
99
100
101 def get_full_content(self, docid):
102 res = self.searcher.stored_fields(docid)
103 f_path = res['path'][res['path'].find(res['repository']) \
104 + len(res['repository']):].lstrip('/')
105
106 content_short = ''.join(self.get_short_content(res))
107 res.update({'content_short':content_short,
108 'content_short_hl':self.highlight(content_short),
109 'f_path':f_path})
110
111 return res
112
113 def get_short_content(self, res):
114 """
115 Smart function that chunks the content without
116 overlapping chunks, so it doesn't highlight the same
117 close occurrences twice.
118 @param matcher:
119 @param size:
120 """
121 memory = [(0, 0)]
122 for span in self.matcher.spans():
123 start = span.startchar or 0
124 end = span.endchar or 0
125 start_offseted = max(0, start - self.fragment_size)
126 end_offseted = end + self.fragment_size
127 print start_offseted, end_offseted
128 if start_offseted < memory[-1][1]:
129 start_offseted = memory[-1][1]
130 memory.append((start_offseted, end_offseted,))
131 yield res["content"][start_offseted:end_offseted]
132
133 def highlight(self, content, top=5):
134 hl = highlight(escape(content),
135 self.highlight_items,
136 analyzer=ANALYZER,
137 fragmenter=FRAGMENTER,
138 formatter=FORMATTER,
139 top=top)
140 return hl
@@ -46,7 +46,7 b''
46 46 h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
47 47 </div>
48 48 <div class="code-body">
49 <pre>${h.literal(sr['content_short'])}</pre>
49 <pre>${h.literal(sr['content_short_hl'])}</pre>
50 50 </div>
51 51 </div>
52 52 </div>
@@ -61,9 +61,11 b''
61 61
62 62 %endif
63 63 %endfor
64
65
66
64 %if c.cur_query:
65 <div class="pagination-wh pagination-left">
66 ${c.formated_results.pager('$link_previous ~2~ $link_next')}
67 </div>
68 %endif
67 69 </div>
68 70
69 71 </%def>
@@ -20,11 +20,11 b' setup('
20 20 "SQLAlchemy>=0.6",
21 21 "babel",
22 22 "Mako>=0.3.2",
23 "vcs>=0.1.4",
23 "vcs>=0.1.5",
24 24 "pygments>=1.3.0",
25 25 "mercurial>=1.6",
26 26 "pysqlite",
27 "whoosh==1.0.0b10",
27 "whoosh==1.0.0b16",
28 28 "py-bcrypt",
29 29 "celery",
30 30 ],
General Comments 0
You need to be logged in to leave comments. Login now