Show More
@@ -26,10 +26,9 b' from pylons import request, response, se' | |||
|
26 | 26 | from pylons.controllers.util import abort, redirect |
|
27 | 27 | from pylons_app.lib.auth import LoginRequired |
|
28 | 28 | from pylons_app.lib.base import BaseController, render |
|
29 |
from pylons_app.lib.indexers import |
|
|
30 |
from webhelpers. |
|
|
31 | from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \ | |
|
32 | ContextFragmenter | |
|
29 | from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper | |
|
30 | from webhelpers.paginate import Page | |
|
31 | from webhelpers.util import update_params | |
|
33 | 32 | from pylons.i18n.translation import _ |
|
34 | 33 | from whoosh.index import open_dir, EmptyIndexError |
|
35 | 34 | from whoosh.qparser import QueryParser, QueryParserError |
@@ -45,69 +44,55 b' class SearchController(BaseController):' | |||
|
def __before__(self):
    """Run the parent controller's ``__before__`` hook; nothing is added here."""
    parent = super(SearchController, self)
    parent.__before__()
|
47 | 46 | |
|
48 | ||
|
def index(self):
    """
    Run a full-text search for the ``q`` request parameter and render
    the search page.

    The template context ``c`` receives:

    * ``c.cur_query`` - the raw query string (``None`` when absent),
    * ``c.runtime`` - a status line (hit count and timing, or a
      translated error message),
    * ``c.formated_results`` - an empty list, or a ``Page`` over a
      ``ResultWrapper`` holding the current page of hits.

    Returns the rendered ``/search/search.html`` template.
    """
    c.formated_results = []
    c.runtime = ''
    c.cur_query = request.GET.get('q', None)

    if c.cur_query:
        cur_query = c.cur_query.lower()

        # A hand-edited ``page`` value must not turn into a server error;
        # fall back to the first page instead.
        try:
            p = int(request.params.get('page', 1))
        except (TypeError, ValueError):
            p = 1

        highlight_items = set()
        try:
            idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
            searcher = idx.searcher()

            try:
                qp = QueryParser("content", schema=SCHEMA)
                query = qp.parse(unicode(cur_query))

                # Collect the terms that should be highlighted in the
                # result snippets (only those aimed at ``content``).
                if isinstance(query, Phrase):
                    highlight_items.update(query.words)
                else:
                    for i in query.all_terms():
                        if i[0] == 'content':
                            highlight_items.add(i[1])

                matcher = query.matcher(searcher)

                log.debug(query)
                log.debug(highlight_items)
                results = searcher.search(query)
                res_ln = len(results)
                c.runtime = '%s results (%.3f seconds)' \
                    % (res_ln, results.runtime)

                def url_generator(**kw):
                    # NOTE(review): the query is interpolated into the URL
                    # without escaping - confirm update_params() copes
                    # with '&', '#' and non-ascii input.
                    return update_params("?q=%s" % c.cur_query, **kw)

                # NOTE(review): this relies on Page reading its slice of
                # the wrapper right here, because the searcher is closed
                # before the template renders - confirm against
                # webhelpers.paginate.
                c.formated_results = Page(
                    ResultWrapper(searcher, matcher, highlight_items),
                    page=p, item_count=res_ln,
                    items_per_page=10, url=url_generator)

            except QueryParserError:
                c.runtime = _('Invalid search query. Try quoting it.')
            finally:
                # Release the searcher on every path, not only when the
                # search succeeded or the query failed to parse.
                searcher.close()
        except (EmptyIndexError, IOError):
            log.error(traceback.format_exc())
            log.error('Empty Index data')
            c.runtime = _('There is no index to search in. Please run whoosh indexer')

    # Return a rendered template
    return render('/search/search.html')
@@ -1,41 +1,140 b'' | |||
|
1 | import sys | |
|
1 | from os.path import dirname as dn, join as jn | |
|
2 | from pidlock import LockHeld, DaemonLock | |
|
3 | from pylons_app.config.environment import load_environment | |
|
4 | from pylons_app.model.hg_model import HgModel | |
|
5 | from shutil import rmtree | |
|
6 | from webhelpers.html.builder import escape | |
|
7 | from vcs.utils.lazy import LazyProperty | |
|
8 | ||
|
9 | from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter | |
|
10 | from whoosh.fields import TEXT, ID, STORED, Schema, FieldType | |
|
11 | from whoosh.index import create_in, open_dir | |
|
12 | from whoosh.formats import Characters | |
|
13 | from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter | |
|
14 | ||
|
2 | 15 | import os |
|
3 | from pidlock import LockHeld, DaemonLock | |
|
16 | import sys | |
|
4 | 17 | import traceback |
|
5 | 18 | |
|
6 | from os.path import dirname as dn | |
|
7 | from os.path import join as jn | |
|
19 | ||
|
8 | 20 | |
|
9 | 21 | #to get the pylons_app import |
|
10 | 22 | sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) |
|
11 | 23 | |
|
12 | from pylons_app.config.environment import load_environment | |
|
13 | from pylons_app.model.hg_model import HgModel | |
|
14 | from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter | |
|
15 | from whoosh.fields import TEXT, ID, STORED, Schema | |
|
16 | from whoosh.index import create_in, open_dir | |
|
17 | from shutil import rmtree | |
|
18 | 24 | |
|
19 | 25 | #LOCATION WE KEEP THE INDEX |
|
20 | 26 | IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index') |
|
21 | 27 | |
|
22 | 28 | #EXTENSIONS WE WANT TO INDEX CONTENT OF |
|
23 |
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', |
|
|
24 |
'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', |
|
|
25 |
'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', |
|
|
26 |
'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', |
|
|
27 |
'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', |
|
|
28 |
'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt', |
|
|
29 | INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c', | |
|
30 | 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', | |
|
31 | 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp', | |
|
32 | 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3', | |
|
33 | 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql', | |
|
34 | 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt', | |
|
29 | 35 | 'yaws'] |
|
30 | 36 | |
|
31 | 37 | #CUSTOM ANALYZER wordsplit + lowercase filter |
|
32 | 38 | ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() |
|
33 | 39 | |
|
40 | ||
|
34 | 41 | #INDEX SCHEMA DEFINITION |
|
35 | 42 | SCHEMA = Schema(owner=TEXT(), |
|
36 | 43 | repository=TEXT(stored=True), |
|
37 | 44 | path=ID(stored=True, unique=True), |
|
38 |
content= |
|
|
39 | modtime=STORED(),extension=TEXT(stored=True)) | |
|
45 | content=FieldType(format=Characters(ANALYZER), | |
|
46 | scorable=True, stored=True), | |
|
47 | modtime=STORED(), extension=TEXT(stored=True)) | |
|
48 | ||
|
49 | ||
|
50 | IDX_NAME = 'HG_INDEX' | |
|
51 | FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') | |
|
52 | FRAGMENTER = SimpleFragmenter(200) | |
|
53 | ||
|
54 | ||
|
55 | ||
|
56 | ||
|
class ResultWrapper(object):
    """
    Lazy, sliceable view over the documents matched by a query.

    It behaves like a read-only sequence (``len()``, iteration, indexing
    and slicing) whose items are the stored fields of a matched document,
    extended with:

    * ``content_short`` - the parts of ``content`` surrounding each match,
    * ``content_short_hl`` - ``content_short`` escaped and highlighted,
    * ``f_path`` - ``path`` with everything up to and including the
      repository name removed.

    :param searcher: object providing ``stored_fields(docnum)``
    :param matcher: object providing ``is_active()``, ``id()``, ``next()``
        and ``spans()``
    :param highlight_items: terms handed to the highlighter
    """

    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # number of characters of context kept on each side of a match
        self.fragment_size = 150 * 2
        # docnum -> [(start, end), ...] offsets, filled in by ``doc_ids``
        self._chunks = {}

    @LazyProperty
    def doc_ids(self):
        """
        Document numbers of all matches, in matcher order.

        The matcher is walked to its end exactly once.  The fragment
        offsets of every document are recorded during that walk, because
        afterwards the matcher is no longer positioned on any document
        (NOTE(review): assumes ``matcher.spans()`` describes the document
        the matcher currently points at - confirm against whoosh docs).
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            self._chunks[docnum] = list(self._iter_chunks())
            docs_id.append(docnum)
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Iterate over all results, building the content of each lazily.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getitem__(self, key):
        """
        Return one result for an integer ``key`` or a list of results
        for a slice.
        """
        if isinstance(key, slice):
            return [self.get_full_content(docid)
                    for docid in self.doc_ids[key]]
        return self.get_full_content(self.doc_ids[key])

    def __getslice__(self, i, j):
        """
        Simple ``wrapper[i:j]`` slicing (the Python 2 slicing hook).
        """
        return self[slice(i, j)]

    def get_full_content(self, docid):
        """
        Return the stored fields of ``docid`` extended with
        ``content_short``, ``content_short_hl`` and ``f_path``.
        """
        res = self.searcher.stored_fields(docid)

        path = res['path']
        repository = res['repository']
        offset = path.find(repository)
        if offset >= 0:
            # drop everything up to and including the repository name;
            # when the name is not in the path the path is kept whole
            path = path[offset + len(repository):]
        f_path = path.lstrip('/')

        # Offsets recorded by ``doc_ids``; ``None`` (document not seen
        # there) makes get_short_content() ask the matcher directly.
        chunks = self._chunks.get(docid)
        content_short = ''.join(self.get_short_content(res, chunks))
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def get_short_content(self, res, chunks=None):
        """
        Yield the pieces of ``res['content']`` that surround the matches.

        :param res: stored fields of a document, must contain ``content``
        :param chunks: iterable of ``(start, end)`` character offsets;
            when omitted they are computed from the matcher's current
            spans
        """
        if chunks is None:
            chunks = self._iter_chunks()
        content = res["content"]
        for start, end in chunks:
            yield content[start:end]

    def _iter_chunks(self):
        """
        Yield ``(start, end)`` offsets around each span of the matcher,
        ``fragment_size`` characters wide on both sides.  A chunk never
        starts before the end of the previous one, so close occurrences
        are not emitted (and highlighted) twice.
        """
        previous_end = 0
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size
            if start_offseted < previous_end:
                start_offseted = previous_end
            previous_end = end_offseted
            yield (start_offseted, end_offseted)

    def highlight(self, content, top=5):
        """
        Escape ``content`` and return it with ``highlight_items`` marked
        up, using the module-level analyzer, fragmenter and formatter.

        :param top: passed to the highlighter as ``top``
        """
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
@@ -46,7 +46,7 b'' | |||
|
46 | 46 | h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div> |
|
47 | 47 | </div> |
|
48 | 48 | <div class="code-body"> |
|
49 | <pre>${h.literal(sr['content_short'])}</pre> | |
|
49 | <pre>${h.literal(sr['content_short_hl'])}</pre> | |
|
50 | 50 | </div> |
|
51 | 51 | </div> |
|
52 | 52 | </div> |
@@ -59,11 +59,13 b'' | |||
|
59 | 59 | </div> |
|
60 | 60 | %endif |
|
61 | 61 | |
|
62 | %endif | |
|
62 | %endif | |
|
63 | 63 | %endfor |
|
64 | ||
|
65 | ||
|
66 | ||
|
64 | %if c.cur_query: | |
|
65 | <div class="pagination-wh pagination-left"> | |
|
66 | ${c.formated_results.pager('$link_previous ~2~ $link_next')} | |
|
67 | </div> | |
|
68 | %endif | |
|
67 | 69 | </div> |
|
68 | 70 | |
|
69 | 71 | </%def> |
@@ -7,7 +7,7 b' except ImportError:' | |||
|
7 | 7 | from setuptools import setup, find_packages |
|
8 | 8 | |
|
9 | 9 | setup( |
|
10 | name='HgApp-%s'%get_version(), | |
|
10 | name='HgApp-%s' % get_version(), | |
|
11 | 11 | version=get_version(), |
|
12 | 12 | description='Mercurial repository serving and browsing app', |
|
13 | 13 | keywords='mercurial web hgwebdir replacement serving hgweb', |
@@ -20,11 +20,11 b' setup(' | |||
|
20 | 20 | "SQLAlchemy>=0.6", |
|
21 | 21 | "babel", |
|
22 | 22 | "Mako>=0.3.2", |
|
23 |
"vcs>=0.1. |
|
|
23 | "vcs>=0.1.5", | |
|
24 | 24 | "pygments>=1.3.0", |
|
25 | 25 | "mercurial>=1.6", |
|
26 | 26 | "pysqlite", |
|
27 |
"whoosh==1.0.0b1 |
|
|
27 | "whoosh==1.0.0b16", | |
|
28 | 28 | "py-bcrypt", |
|
29 | 29 | "celery", |
|
30 | 30 | ], |
General Comments 0
You need to be logged in to leave comments.
Login now