upstream/kallithea Commit - r478:7010af6e

Reimplemented searching for speed on large files and added paging for search results...

marcink -

r478:7010af6e celery

parent child

pylons_app/controllers/search.py

0 +27 -42

             from pylons.controllers.util import abort, redirect
             from pylons_app.lib.auth import LoginRequired
             from pylons_app.lib.base import BaseController, render
-            from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
+            from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
-            from webhelpers.html.builder import escape
+            from webhelpers.paginate import Page
-            from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
+            from webhelpers.util import update_params
-                ContextFragmenter
             from pylons.i18n.translation import _
             from whoosh.index import open_dir, EmptyIndexError
             from whoosh.qparser import QueryParser, QueryParserError
                 def __before__(self):
                     super(SearchController, self).__before__()
                 def index(self):
                     c.formated_results = []
                     c.runtime = ''
-                    search_items = set()
                     c.cur_query = request.GET.get('q', None)
                     if c.cur_query:
                         cur_query = c.cur_query.lower()
                     if c.cur_query:
+                        p = int(request.params.get('page', 1))
+                        highlight_items = set()
                         try:
                             idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
                             searcher = idx.searcher()
                             qp = QueryParser("content", schema=SCHEMA)
                             try:
                                 query = qp.parse(unicode(cur_query))
                                 if isinstance(query, Phrase):
-                                    search_items.update(query.words)
+                                    highlight_items.update(query.words)
                                 else:
                                     for i in query.all_terms():
-                                        search_items.add(i[1])
+                                        if i[0] == 'content':
+                                            highlight_items.add(i[1])
-                                log.debug(query)
-                                log.debug(search_items)
-                                results = searcher.search(query)
-                                c.runtime = '%s results (%.3f seconds)' \
-                                % (len(results), results.runtime)
-                                analyzer = ANALYZER
+                                matcher = query.matcher(searcher)
-                                formatter = HtmlFormatter('span',
-                                    between='\n<span class="break">...</span>\n')
-                                #how the parts are splitted within the same text part
-                                fragmenter = SimpleFragmenter(200)
-                                #fragmenter = ContextFragmenter(search_items)
-                                for res in results:
+                                log.debug(query)
-                                    d = {}
+                                log.debug(highlight_items)
-                                    d.update(res)
+                                results = searcher.search(query)
-                                    hl = highlight(escape(res['content']), search_items,
+                                res_ln = len(results)
-                                                                     analyzer=analyzer,
+                                c.runtime = '%s results (%.3f seconds)' \
-                                                                     fragmenter=fragmenter,
+                                % (res_ln, results.runtime)
-                                                                     formatter=formatter,
-                                                                     top=5)
+                                def url_generator(**kw):
-                                    f_path = res['path'][res['path'].find(res['repository']) \
+                                    return update_params("?q=%s" % c.cur_query, **kw)
-                                                         + len(res['repository']):].lstrip('/')
-                                    d.update({'content_short':hl,
+                                c.formated_results = Page(
-                                              'f_path':f_path})
+                                            ResultWrapper(searcher, matcher, highlight_items),
-                                    #del d['content']
+                                            page=p, item_count=res_ln,
-                                    c.formated_results.append(d)
+                                            items_per_page=10, url=url_generator)
                             except QueryParserError:
                                 c.runtime = _('Invalid search query. Try quoting it.')
+                            searcher.close()
                         except (EmptyIndexError, IOError):
                             log.error(traceback.format_exc())
                             log.error('Empty Index data')
                             c.runtime = _('There is no index to search in. Please run whoosh indexer')
                     # Return a rendered template
                     return render('/search/search.html')

pylons_app/lib/indexers/__init__.py

0 +118 -19

@@ -1,41 +1,140 b''
1	import sys	1	from os.path import dirname as dn, join as jn
		2	from pidlock import LockHeld, DaemonLock
		3	from pylons_app.config.environment import load_environment
		4	from pylons_app.model.hg_model import HgModel
		5	from shutil import rmtree
		6	from webhelpers.html.builder import escape
		7	from vcs.utils.lazy import LazyProperty
		8
		9	from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
		10	from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
		11	from whoosh.index import create_in, open_dir
		12	from whoosh.formats import Characters
		13	from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
		14
2	import os	15	import os
3	from pidlock import LockHeld, DaemonLock	16	import sys
4	import traceback	17	import traceback
5		18
6	from os.path import dirname as dn	19
7	from os.path import join as jn
8		20
9	#to get the pylons_app import	21	#to get the pylons_app import
10	sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))	22	sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11		23
12	from pylons_app.config.environment import load_environment
13	from pylons_app.model.hg_model import HgModel
14	from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15	from whoosh.fields import TEXT, ID, STORED, Schema
16	from whoosh.index import create_in, open_dir
17	from shutil import rmtree
18		24
19	#LOCATION WE KEEP THE INDEX	25	#LOCATION WE KEEP THE INDEX
20	IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')	26	IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21		27
22	#EXTENSIONS WE WANT TO INDEX CONTENT OFF	28	#EXTENSIONS WE WANT TO INDEX CONTENT OFF
23	INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',	29	INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24	'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', ~~'h'~~,	30	'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
25	'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',	31	'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26	'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',	32	'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27	'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',	33	'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28	'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',	34	'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
29	'yaws']	35	'yaws']
30		36
31	#CUSTOM ANALYZER wordsplit + lowercase filter	37	#CUSTOM ANALYZER wordsplit + lowercase filter
32	ANALYZER = RegexTokenizer(expression=r"\w+") \| LowercaseFilter()	38	ANALYZER = RegexTokenizer(expression=r"\w+") \| LowercaseFilter()
33		39
		40
34	#INDEX SCHEMA DEFINITION	41	#INDEX SCHEMA DEFINITION
35	SCHEMA = Schema(owner=TEXT(),	42	SCHEMA = Schema(owner=TEXT(),
36	repository=TEXT(stored=True),	43	repository=TEXT(stored=True),
37	path=ID(stored=True, unique=True),	44	path=ID(stored=True, unique=True),
38	content=~~TEXT~~(~~stored~~=~~True~~, ~~analyzer~~=ANALYZER),	45	content=FieldType(format=Characters(ANALYZER),
39	modtime=STORED(),extension=TEXT(stored=True))	46	scorable=True, stored=True),
		47	modtime=STORED(), extension=TEXT(stored=True))
		48
		49
		50	IDX_NAME = 'HG_INDEX'
		51	FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
		52	FRAGMENTER = SimpleFragmenter(200)
		53
		54
		55
		56
		57	class ResultWrapper(object):
		58	def __init__(self, searcher, matcher, highlight_items):
		59	self.searcher = searcher
		60	self.matcher = matcher
		61	self.highlight_items = highlight_items
		62	self.fragment_size = 150 * 2
		63
		64	@LazyProperty
		65	def doc_ids(self):
		66	docs_id = []
		67	while self.matcher.is_active():
		68	docnum = self.matcher.id()
		69	docs_id.append(docnum)
		70	self.matcher.next()
		71	return docs_id
		72
		73	def __str__(self):
		74	return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
		75
		76	def __repr__(self):
		77	return self.__str__()
		78
		79	def __len__(self):
		80	return len(self.doc_ids)
		81
		82	def __iter__(self):
		83	"""
		84	Allows iteration over results and lazily generates content
		85
		86	Requires implementation of ``__getitem__`` method.
		87	"""
		88	for docid in self.doc_ids:
		89	yield self.get_full_content(docid)
40		90
41	IDX_NAME = 'HG_INDEX' No newline at end of file	91	def __getslice__(self, i, j):
		92	"""
		93	Slicing of resultWrapper
		94	"""
		95	slice = []
		96	for docid in self.doc_ids[i:j]:
		97	slice.append(self.get_full_content(docid))
		98	return slice
		99
		100
		101	def get_full_content(self, docid):
		102	res = self.searcher.stored_fields(docid)
		103	f_path = res['path'][res['path'].find(res['repository']) \
		104	+ len(res['repository']):].lstrip('/')
		105
		106	content_short = ''.join(self.get_short_content(res))
		107	res.update({'content_short':content_short,
		108	'content_short_hl':self.highlight(content_short),
		109	'f_path':f_path})
		110
		111	return res
		112
		113	def get_short_content(self, res):
		114	"""
		115	Smart function that splits the content into chunks
		116	that do not overlap, so it doesn't highlight the same
		117	close occurrences twice.
		118	@param matcher:
		119	@param size:
		120	"""
		121	memory = [(0, 0)]
		122	for span in self.matcher.spans():
		123	start = span.startchar or 0
		124	end = span.endchar or 0
		125	start_offseted = max(0, start - self.fragment_size)
		126	end_offseted = end + self.fragment_size
		127	print start_offseted, end_offseted
		128	if start_offseted < memory[-1][1]:
		129	start_offseted = memory[-1][1]
		130	memory.append((start_offseted, end_offseted,))
		131	yield res["content"][start_offseted:end_offseted]
		132
		133	def highlight(self, content, top=5):
		134	hl = highlight(escape(content),
		135	self.highlight_items,
		136	analyzer=ANALYZER,
		137	fragmenter=FRAGMENTER,
		138	formatter=FORMATTER,
		139	top=top)
		140	return hl

pylons_app/templates/search/search.html

0 +7 -5

             					h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
             				</div>
             				<div class="code-body">
-            					<pre>${h.literal(sr['content_short'])}</pre>
+            					<pre>${h.literal(sr['content_short_hl'])}</pre>
             				</div>
             			</div>
             		</div>
             			</div>
             			%endif
             		%endif
             	%endfor
+            	%if c.cur_query:
+            	<div class="pagination-wh pagination-left">
+            		${c.formated_results.pager('$link_previous ~2~ $link_next')}
+            	</div>
+            	%endif
             </div>
             </%def>

setup.py

0 +3 -3

                 from setuptools import setup, find_packages
             setup(
-                name='HgApp-%s'%get_version(),
+                name='HgApp-%s' % get_version(),
                 version=get_version(),
                 description='Mercurial repository serving and browsing app',
                 keywords='mercurial web hgwebdir replacement serving hgweb',
                     "SQLAlchemy>=0.6",
                     "babel",
                     "Mako>=0.3.2",
-                    "vcs>=0.1.4",
+                    "vcs>=0.1.5",
                     "pygments>=1.3.0",
                     "mercurial>=1.6",
                     "pysqlite",
-                    "whoosh==1.0.0b10",
+                    "whoosh==1.0.0b16",
                     "py-bcrypt",
                     "celery",
                 ],

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages