upstream/kallithea Commit - r478:7010af6e

Reimplemented searching for speed on large files and added paging for search results...

marcink -

r478:7010af6e celery

parent child

pylons_app/controllers/search.py

0 +27 -42

              from pylons.controllers.util import abort, redirect
              from pylons_app.lib.auth import LoginRequired
              from pylons_app.lib.base import BaseController, render
-             from pylons_app.lib.indexers import ANALYZER, IDX_LOCATION, SCHEMA, IDX_NAME
-             from webhelpers.html.builder import escape
-             from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter, \
-                 ContextFragmenter
+             from pylons_app.lib.indexers import IDX_LOCATION, SCHEMA, IDX_NAME, ResultWrapper
+             from webhelpers.paginate import Page
+             from webhelpers.util import update_params
              from pylons.i18n.translation import _
              from whoosh.index import open_dir, EmptyIndexError
              from whoosh.qparser import QueryParser, QueryParserError
                  def __before__(self):
                      super(SearchController, self).__before__()
                  def index(self):
                      c.formated_results = []
                      c.runtime = ''
-                     search_items = set()
                      c.cur_query = request.GET.get('q', None)
                      if c.cur_query:
                          cur_query = c.cur_query.lower()
                      if c.cur_query:
+                         p = int(request.params.get('page', 1))
+                         highlight_items = set()
                          try:
                              idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
                              searcher = idx.searcher()
                              qp = QueryParser("content", schema=SCHEMA)
                              try:
                                  query = qp.parse(unicode(cur_query))
                                  if isinstance(query, Phrase):
-                                     search_items.update(query.words)
+                                     highlight_items.update(query.words)
                                  else:
                                      for i in query.all_terms():
-                                         search_items.add(i[1])
-                                 log.debug(query)
-                                 log.debug(search_items)
-                                 results = searcher.search(query)
-                                 c.runtime = '%s results (%.3f seconds)' \
-                                 % (len(results), results.runtime)
+                                         if i[0] == 'content':
+                                             highlight_items.add(i[1])
-                                 analyzer = ANALYZER
-                                 formatter = HtmlFormatter('span',
-                                     between='\n<span class="break">...</span>\n')
-                                 #how the parts are splitted within the same text part
-                                 fragmenter = SimpleFragmenter(200)
-                                 #fragmenter = ContextFragmenter(search_items)
+                                 matcher = query.matcher(searcher)
-                                 for res in results:
-                                     d = {}
-                                     d.update(res)
-                                     hl = highlight(escape(res['content']), search_items,
-                                                                      analyzer=analyzer,
-                                                                      fragmenter=fragmenter,
-                                                                      formatter=formatter,
-                                                                      top=5)
-                                     f_path = res['path'][res['path'].find(res['repository']) \
-                                                          + len(res['repository']):].lstrip('/')
-                                     d.update({'content_short':hl,
-                                               'f_path':f_path})
-                                     #del d['content']
-                                     c.formated_results.append(d)
+                                 log.debug(query)
+                                 log.debug(highlight_items)
+                                 results = searcher.search(query)
+                                 res_ln = len(results)
+                                 c.runtime = '%s results (%.3f seconds)' \
+                                 % (res_ln, results.runtime)
+                                 def url_generator(**kw):
+                                     return update_params("?q=%s" % c.cur_query, **kw)
+                                 c.formated_results = Page(
+                                             ResultWrapper(searcher, matcher, highlight_items),
+                                             page=p, item_count=res_ln,
+                                             items_per_page=10, url=url_generator)
                              except QueryParserError:
                                  c.runtime = _('Invalid search query. Try quoting it.')
+                             searcher.close()
                          except (EmptyIndexError, IOError):
                              log.error(traceback.format_exc())
                              log.error('Empty Index data')
                              c.runtime = _('There is no index to search in. Please run whoosh indexer')
                      # Return a rendered template
                      return render('/search/search.html')

pylons_app/lib/indexers/__init__.py

0 +118 -19

		@@ -1,41 +1,140 b''
1		import sys
	1	from os.path import dirname as dn, join as jn
	2	from pidlock import LockHeld, DaemonLock
	3	from pylons_app.config.environment import load_environment
	4	from pylons_app.model.hg_model import HgModel
	5	from shutil import rmtree
	6	from webhelpers.html.builder import escape
	7	from vcs.utils.lazy import LazyProperty
	8
	9	from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
	10	from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
	11	from whoosh.index import create_in, open_dir
	12	from whoosh.formats import Characters
	13	from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
	14
2	15	import os
3		from pidlock import LockHeld, DaemonLock
	16	import sys
4	17	import traceback
5	18
6		from os.path import dirname as dn
7		from os.path import join as jn
	19
8	20
9	21	#to get the pylons_app import
10	22	sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
11	23
12		from pylons_app.config.environment import load_environment
13		from pylons_app.model.hg_model import HgModel
14		from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
15		from whoosh.fields import TEXT, ID, STORED, Schema
16		from whoosh.index import create_in, open_dir
17		from shutil import rmtree
18	24
19	25	#LOCATION WE KEEP THE INDEX
20	26	IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
21	27
22	28	#EXTENSIONS WE WANT TO INDEX CONTENT OF
23		INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
24		'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl', ~~'h'~~,
25		'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
26		'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
27		'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
28		'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml','xsl','xslt',
	29	INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
	30	'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
	31	'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
	32	'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
	33	'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
	34	'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
29	35	'yaws']
30	36
31	37	#CUSTOM ANALYZER wordsplit + lowercase filter
32	38	ANALYZER = RegexTokenizer(expression=r"\w+") \| LowercaseFilter()
33	39
	40
34	41	#INDEX SCHEMA DEFINITION
35	42	SCHEMA = Schema(owner=TEXT(),
36	43	repository=TEXT(stored=True),
37	44	path=ID(stored=True, unique=True),
38		content=~~TEXT~~(~~stored~~=~~True~~, ~~analyzer~~=ANALYZER),
39		modtime=STORED(),extension=TEXT(stored=True))
	45	content=FieldType(format=Characters(ANALYZER),
	46	scorable=True, stored=True),
	47	modtime=STORED(), extension=TEXT(stored=True))
	48
	49
	50	IDX_NAME = 'HG_INDEX'
	51	FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
	52	FRAGMENTER = SimpleFragmenter(200)
	53
	54
	55
	56
	57	class ResultWrapper(object):
	58	def __init__(self, searcher, matcher, highlight_items):
	59	self.searcher = searcher
	60	self.matcher = matcher
	61	self.highlight_items = highlight_items
	62	self.fragment_size = 150 * 2
	63
	64	@LazyProperty
	65	def doc_ids(self):
	66	docs_id = []
	67	while self.matcher.is_active():
	68	docnum = self.matcher.id()
	69	docs_id.append(docnum)
	70	self.matcher.next()
	71	return docs_id
	72
	73	def __str__(self):
	74	return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
	75
	76	def __repr__(self):
	77	return self.__str__()
	78
	79	def __len__(self):
	80	return len(self.doc_ids)
	81
	82	def __iter__(self):
	83	"""
	84	Allows Iteration over results,and lazy generate content
	85
	86	Requires implementation of ``__getitem__`` method.
	87	"""
	88	for docid in self.doc_ids:
	89	yield self.get_full_content(docid)
40	90
41		IDX_NAME = 'HG_INDEX' No newline at end of file
	91	def __getslice__(self, i, j):
	92	"""
	93	Slicing of resultWrapper
	94	"""
	95	slice = []
	96	for docid in self.doc_ids[i:j]:
	97	slice.append(self.get_full_content(docid))
	98	return slice
	99
	100
	101	def get_full_content(self, docid):
	102	res = self.searcher.stored_fields(docid)
	103	f_path = res['path'][res['path'].find(res['repository']) \
	104	+ len(res['repository']):].lstrip('/')
	105
	106	content_short = ''.join(self.get_short_content(res))
	107	res.update({'content_short':content_short,
	108	'content_short_hl':self.highlight(content_short),
	109	'f_path':f_path})
	110
	111	return res
	112
	113	def get_short_content(self, res):
	114	"""
	115	Smart function that implements chunking the content
	116	but not overlap chunks so it doesn't highlight the same
	117	close occurences twice.
	118	@param matcher:
	119	@param size:
	120	"""
	121	memory = [(0, 0)]
	122	for span in self.matcher.spans():
	123	start = span.startchar or 0
	124	end = span.endchar or 0
	125	start_offseted = max(0, start - self.fragment_size)
	126	end_offseted = end + self.fragment_size
	127	print start_offseted, end_offseted
	128	if start_offseted < memory[-1][1]:
	129	start_offseted = memory[-1][1]
	130	memory.append((start_offseted, end_offseted,))
	131	yield res["content"][start_offseted:end_offseted]
	132
	133	def highlight(self, content, top=5):
	134	hl = highlight(escape(content),
	135	self.highlight_items,
	136	analyzer=ANALYZER,
	137	fragmenter=FRAGMENTER,
	138	formatter=FORMATTER,
	139	top=top)
	140	return hl

pylons_app/templates/search/search.html

0 +7 -5

              					h.url('files_home',repo_name=sr['repository'],revision='tip',f_path=sr['f_path']))}</div>
              				</div>
              				<div class="code-body">
-             					<pre>${h.literal(sr['content_short'])}</pre>
+             					<pre>${h.literal(sr['content_short_hl'])}</pre>
              				</div>
              			</div>
              		</div>
              			</div>
              			%endif
-             		%endif
+             		%endif
              	%endfor
+             	%if c.cur_query:
+             	<div class="pagination-wh pagination-left">
+             		${c.formated_results.pager('$link_previous ~2~ $link_next')}
+             	</div>
+             	%endif
              </div>
              </%def>

setup.py

0 +3 -3

                  from setuptools import setup, find_packages
              setup(
-                 name='HgApp-%s'%get_version(),
+                 name='HgApp-%s' % get_version(),
                  version=get_version(),
                  description='Mercurial repository serving and browsing app',
                  keywords='mercurial web hgwebdir replacement serving hgweb',
                      "SQLAlchemy>=0.6",
                      "babel",
                      "Mako>=0.3.2",
-                     "vcs>=0.1.4",
+                     "vcs>=0.1.5",
                      "pygments>=1.3.0",
                      "mercurial>=1.6",
                      "pysqlite",
-                     "whoosh==1.0.0b10",
+                     "whoosh==1.0.0b16",
                      "py-bcrypt",
                      "celery",
                  ],

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages