__init__.py
139 lines | 4.8 KiB | text/x-python | PythonLexer

from os.path import dirname as dn, join as jn
from pylons_app.config.environment import load_environment
from pylons_app.model.hg_model import HgModel
from shutil import rmtree
from webhelpers.html.builder import escape
from vcs.utils.lazy import LazyProperty
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
import os
import sys
import traceback

#make pylons_app importable when this module is run standalone
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

#LOCATION WHERE WE KEEP THE INDEX
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')

#EXTENSIONS WE WANT TO INDEX CONTENT OF
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']
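
#A minimal sketch (an assumption, not part of this module) of how a file
#walker could use INDEX_EXTENSIONS to pick files worth indexing:
#
#   def is_indexable(path):
#       return path.rsplit('.', 1)[-1].lower() in INDEX_EXTENSIONS
#
#   is_indexable('setup.py')   # True,  'py' is listed
#   is_indexable('logo.png')   # False, 'png' is not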

#CUSTOM ANALYZER: wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
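
#Illustration of what this analyzer yields (whoosh analyzers are callable
#and produce Token objects with a .text attribute):
#
#   >>> [t.text for t in ANALYZER(u'Foo_bar BAZ 42')]
#   [u'foo_bar', u'baz', u'42']
#
#RegexTokenizer keeps each \w+ run (letters, digits, underscore) as a
#single token; LowercaseFilter then folds the case.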

#INDEX SCHEMA DEFINITION
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=FieldType(format=Characters(ANALYZER),
                                  scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = SimpleFragmenter(200)
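
#A hedged sketch of building an index with the schema above (the daemon
#that actually populates the index lives elsewhere; the field values here
#are made up for illustration):
#
#   if not os.path.exists(IDX_LOCATION):
#       os.makedirs(IDX_LOCATION)
#   idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
#   writer = idx.writer()
#   writer.add_document(owner=u'admin', repository=u'pylons_app',
#                       path=u'/pylons_app/__init__.py',
#                       content=u'file content to index', modtime=0,
#                       extension=u'py')
#   writer.commit()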

class ResultWrapper(object):
    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        #half of the FRAGMENTER's 200-char fragment, used as padding
        #on each side of a match in get_chunks()
        self.fragment_size = 200 / 2

    @LazyProperty
    def doc_ids(self):
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            chunks = [offsets for offsets in self.get_chunks()]
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows iteration over results, generating content lazily.

        *Requires* implementation of ``__getitem__`` method.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getslice__(self, i, j):
        """
        Slicing of ResultWrapper
        """
        slice_ = []
        for docid in self.doc_ids[i:j]:
            slice_.append(self.get_full_content(docid))
        return slice_

    def get_full_content(self, docid):
        res = self.searcher.stored_fields(docid[0])
        #strip everything up to and including the repository name from
        #the stored path
        f_path = res['path'][res['path'].find(res['repository'])
                             + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short': content_short,
                    'content_short_hl': self.highlight(content_short),
                    'f_path': f_path})

        return res

    def get_short_content(self, res, chunks):
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but does not overlap chunks, so it doesn't highlight the same
        close occurrences twice.
        """
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)
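
    #Worked example of the merge above (fragment_size == 100): match spans
    #at chars 120-125 and 150-155 yield chunks (20, 225) and (225, 255);
    #the second chunk starts where the first one ended instead of
    #re-covering the overlapping text.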

    def highlight(self, content, top=5):
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
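
#A hedged end-to-end sketch of driving ResultWrapper (the QueryParser setup
#and the query text are assumptions for illustration):
#
#   from whoosh.qparser import QueryParser
#
#   idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
#   searcher = idx.searcher()
#   query = QueryParser('content', schema=SCHEMA).parse(u'lazy')
#   matcher = query.matcher(searcher)
#   results = ResultWrapper(searcher, matcher,
#                           highlight_items=set([u'lazy']))
#   for res in results[0:10]:  #paging goes through __getslice__
#       print res['f_path'], res['content_short_hl']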