diff --git a/pylons_app/controllers/search.py b/pylons_app/controllers/search.py
--- a/pylons_app/controllers/search.py
+++ b/pylons_app/controllers/search.py
@@ -26,10 +26,9 @@ from pylons import request, response, se
 from pylons.controllers.util import abort, redirect
 from pylons_app.lib.auth import LoginRequired
 from pylons_app.lib.base import BaseController, render
-from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
-from webhelpers.html.builder import escape
-from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
-    ContextFragmenter
+from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
+from webhelpers.paginate import Page
+from webhelpers.util import update_params
 from pylons.i18n.translation import _
 from whoosh.index import open_dir, EmptyIndexError
 from whoosh.qparser import QueryParser, QueryParserError
@@ -45,69 +44,55 @@ class SearchController(BaseController):
 
     def __before__(self):
         super(SearchController, self).__before__()
-
     def index(self):
         c.formated_results = []
         c.runtime = ''
-        search_items = set()
         c.cur_query = request.GET.get('q', None)
         if c.cur_query:
             cur_query = c.cur_query.lower()
-
         if c.cur_query:
+            p = int(request.params.get('page', 1))
+            highlight_items = set()
             try:
                 idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
                 searcher = idx.searcher()
-
+
                 qp = QueryParser("content", schema=SCHEMA)
                 try:
                     query = qp.parse(unicode(cur_query))
 
                     if isinstance(query, Phrase):
-                        search_items.update(query.words)
+                        highlight_items.update(query.words)
                     else:
                         for i in query.all_terms():
-                            search_items.add(i[1])
-
-                    log.debug(query)
-                    log.debug(search_items)
-                    results = searcher.search(query)
-                    c.runtime = '%s results (%.3f seconds)' \
-                        % (len(results), results.runtime)
+                            if i[0] == 'content':
+                                highlight_items.add(i[1])
 
-                    analyzer = ANALYZER
-                    formatter = HtmlFormatter('span',
-                                              between='\n...\n')
-
-                    #how the parts are splitted within the same text part
-                    fragmenter = SimpleFragmenter(200)
-                    #fragmenter = ContextFragmenter(search_items)
+                    matcher = query.matcher(searcher)
 
-                    for res in results:
-                        d = {}
-                        d.update(res)
-                        hl = highlight(escape(res['content']), search_items,
-                                       analyzer=analyzer,
-                                       fragmenter=fragmenter,
-                                       formatter=formatter,
-                                       top=5)
-                        f_path = res['path'][res['path'].find(res['repository']) \
-                            + len(res['repository']):].lstrip('/')
-                        d.update({'content_short':hl,
-                                  'f_path':f_path})
-                        #del d['content']
-                        c.formated_results.append(d)
-
+                    log.debug(query)
+                    log.debug(highlight_items)
+                    results = searcher.search(query)
+                    res_ln = len(results)
+                    c.runtime = '%s results (%.3f seconds)' \
+                        % (res_ln, results.runtime)
+
+                    def url_generator(**kw):
+                        return update_params("?q=%s" % c.cur_query, **kw)
+
+                    c.formated_results = Page(
+                        ResultWrapper(searcher, matcher, highlight_items),
+                        page=p, item_count=res_ln,
+                        items_per_page=10, url=url_generator)
+
                 except QueryParserError:
                     c.runtime = _('Invalid search query. Try quoting it.')
-
+                searcher.close()
             except (EmptyIndexError, IOError):
                 log.error(traceback.format_exc())
                 log.error('Empty Index data')
                 c.runtime = _('There is no index to search in. Please run whoosh indexer')
-
-
-
+
         # Return a rendered template
         return render('/search/search.html')
diff --git a/pylons_app/lib/indexers/__init__.py b/pylons_app/lib/indexers/__init__.py
--- a/pylons_app/lib/indexers/__init__.py
+++ b/pylons_app/lib/indexers/__init__.py
@@ -1,41 +1,140 @@
-import sys
+from os.path import dirname as dn, join as jn
+from pidlock import LockHeld, DaemonLock
+from pylons_app.config.environment import load_environment
+from pylons_app.model.hg_model import HgModel
+from shutil import rmtree
+from webhelpers.html.builder import escape
+from vcs.utils.lazy import LazyProperty
+
+from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
+from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
+from whoosh.index import create_in, open_dir
+from whoosh.formats import Characters
+from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
+
 import os
-from pidlock import LockHeld, DaemonLock
+import sys
 import traceback
-from os.path import dirname as dn
-from os.path import join as jn
+
 
 #to get the pylons_app import
 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
-from pylons_app.config.environment import load_environment
-from pylons_app.model.hg_model import HgModel
-from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
-from whoosh.fields import TEXT, ID, STORED, Schema
-from whoosh.index import create_in, open_dir
-from shutil import rmtree
 
 #LOCATION WE KEEP THE INDEX
 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 
 #EXTENSIONS WE WANT TO INDEX CONTENT OFF
-INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
-                    'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', 'h',
-                    'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
-                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
-                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
-                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
+INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
+                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
+                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
+                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
+                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
+                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                     'yaws']
 
 #CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
+
 
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(owner=TEXT(),
                 repository=TEXT(stored=True),
                 path=ID(stored=True, unique=True),
-                content=TEXT(stored=True, analyzer=ANALYZER),
-                modtime=STORED(),extension=TEXT(stored=True))
+                content=FieldType(format=Characters(ANALYZER),
+                                  scorable=True, stored=True),
+                modtime=STORED(), extension=TEXT(stored=True))
+
+
+IDX_NAME = 'HG_INDEX'
+FORMATTER = HtmlFormatter('span', between='\n...\n')
+FRAGMENTER = SimpleFragmenter(200)
+
+
+
+
+class ResultWrapper(object):
+    def __init__(self, searcher, matcher, highlight_items):
+        self.searcher = searcher
+        self.matcher = matcher
+        self.highlight_items = highlight_items
+        self.fragment_size = 150 * 2
+
+    @LazyProperty
+    def doc_ids(self):
+        docs_id = []
+        while self.matcher.is_active():
+            docnum = self.matcher.id()
+            docs_id.append(docnum)
+            self.matcher.next()
+        return docs_id
+
+    def __str__(self):
+        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __len__(self):
+        return len(self.doc_ids)
+
+    def __iter__(self):
+        """
+        Allows iteration over results and lazily generates content.
+
+        *Requires* implementation of the ``__getitem__`` method.
+        """
+        for docid in self.doc_ids:
+            yield self.get_full_content(docid)
-IDX_NAME = 'HG_INDEX'
\ No newline at end of file
+    def __getslice__(self, i, j):
+        """
+        Slicing of ResultWrapper
+        """
+        slice = []
+        for docid in self.doc_ids[i:j]:
+            slice.append(self.get_full_content(docid))
+        return slice
+
+
+    def get_full_content(self, docid):
+        res = self.searcher.stored_fields(docid)
+        f_path = res['path'][res['path'].find(res['repository']) \
+            + len(res['repository']):].lstrip('/')
+
+        content_short = ''.join(self.get_short_content(res))
+        res.update({'content_short':content_short,
+                    'content_short_hl':self.highlight(content_short),
+                    'f_path':f_path})
+
+        return res
+
+    def get_short_content(self, res):
+        """
+        Chunks the content around each match, clamping each chunk's start
+        to the previous chunk's end, so chunks never overlap and the same
+        close occurrences are not highlighted twice.
+        """
+        memory = [(0, 0)]
+        for span in self.matcher.spans():
+            start = span.startchar or 0
+            end = span.endchar or 0
+            start_offseted = max(0, start - self.fragment_size)
+            end_offseted = end + self.fragment_size
+            if start_offseted < memory[-1][1]:
+                start_offseted = memory[-1][1]
+            memory.append((start_offseted, end_offseted,))
+            yield res["content"][start_offseted:end_offseted]
+
+    def highlight(self, content, top=5):
+        hl = highlight(escape(content),
+                       self.highlight_items,
+                       analyzer=ANALYZER,
+                       fragmenter=FRAGMENTER,
+                       formatter=FORMATTER,
+                       top=top)
+        return hl
diff --git a/pylons_app/templates/search/search.html b/pylons_app/templates/search/search.html
--- a/pylons_app/templates/search/search.html
+++ b/pylons_app/templates/search/search.html
@@ -46,7 +46,7 @@
           h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}
-      ${h.literal(sr['content_short'])}
+      ${h.literal(sr['content_short_hl'])}
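
A note on the excerpt logic introduced above: ResultWrapper.get_short_content keeps a memory of the previous chunk's end and clamps the next chunk's start to it, so two matches that sit close together produce adjoining excerpts rather than overlapping ones. Below is a minimal, self-contained sketch of that idea, not the patch's code: chunk_around_matches and its spans argument (plain (startchar, endchar) pairs) are hypothetical stand-ins for the live Whoosh matcher used in the patch.

def chunk_around_matches(content, spans, fragment_size=300):
    """Yield one excerpt per match, clamped so excerpts never overlap.

    spans is assumed to be a list of (startchar, endchar) offsets, playing
    the role of self.matcher.spans() in the patch above.
    """
    last_end = 0
    for start, end in spans:
        chunk_start = max(0, start - fragment_size)
        chunk_end = end + fragment_size
        if chunk_start < last_end:
            # would overlap the previous excerpt, so start where it stopped
            chunk_start = last_end
        last_end = chunk_end
        yield content[chunk_start:chunk_end]

# Two hits only 40 characters apart yield adjoining, non-overlapping excerpts.
text = 'x' * 1000
excerpts = list(chunk_around_matches(text, [(100, 110), (140, 150)], fragment_size=50))
assert [len(e) for e in excerpts] == [110, 40]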
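
On the pagination side, webhelpers.paginate.Page (as wired up in the controller with an explicit item_count) essentially only asks the wrapped collection for a slice covering the page being rendered, which is why ResultWrapper exposes __len__ and __getslice__ and defers stored_fields() plus highlighting to get_full_content. The following is a rough illustration of that lazy contract under those assumptions; LazyResults and its built list are invented for the example and stand in for the searcher-backed wrapper.

class LazyResults(object):
    """Toy stand-in for ResultWrapper: knows its length, builds items on demand."""
    def __init__(self, doc_ids):
        self.doc_ids = doc_ids
        self.built = []                    # records which docs were materialised

    def __len__(self):
        return len(self.doc_ids)

    def __getitem__(self, key):            # a paginator asks for one page's slice
        ids = self.doc_ids[key] if isinstance(key, slice) else [self.doc_ids[key]]
        self.built.extend(ids)
        return [{'docid': i} for i in ids]

results = LazyResults(list(range(1000)))
items_per_page = 10
page = 3
first = (page - 1) * items_per_page
current_page = results[first:first + items_per_page]
assert len(results.built) == items_per_page   # only one page was materialised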