##// END OF EJS Templates
fixed search chunking bug and optimized chunk size
marcink -
r479:149940ba celery
parent child Browse files
Show More
@@ -1,140 +1,140 b''
1 1 from os.path import dirname as dn, join as jn
2 2 from pidlock import LockHeld, DaemonLock
3 3 from pylons_app.config.environment import load_environment
4 4 from pylons_app.model.hg_model import HgModel
5 5 from shutil import rmtree
6 6 from webhelpers.html.builder import escape
7 7 from vcs.utils.lazy import LazyProperty
8 8
9 9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 11 from whoosh.index import create_in, open_dir
12 12 from whoosh.formats import Characters
13 13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14 14
15 15 import os
16 16 import sys
17 17 import traceback
18 18
19
20
# Make the project root importable so the pylons_app package resolves
# when this module is run as a standalone script (three dirname() hops
# above this file).
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))


# LOCATION WE KEEP THE INDEX: <project root>/data/index
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')

# EXTENSIONS WE WANT TO INDEX CONTENT OFF — only files with these
# extensions get their content stored and searched
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']

# CUSTOM ANALYZER: word-split on \w+ runs, then lowercase each token
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


# INDEX SCHEMA DEFINITION — `content` uses the Characters format so
# per-term character offsets are kept for excerpt extraction/highlighting
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=FieldType(format=Characters(ANALYZER),
                                  scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))


IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = SimpleFragmenter(200)
53
54
55
56 51
class ResultWrapper(object):
    """
    Lazy wrapper over a whoosh searcher/matcher pair.

    Results are materialized only on iteration or slicing. For every
    matched document the matcher's hit offsets are pre-collected as
    non-overlapping (start, end) character chunks, so the stored
    content can later be cut down to short, highlightable excerpts.
    """
    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # context kept on each side of a hit; half the
        # SimpleFragmenter(200) fragment size (Py2 int division -> 100)
        self.fragment_size = 200 / 2

    @LazyProperty
    def doc_ids(self):
        """
        Walk the matcher once and collect ``[docnum, chunks]`` pairs,
        where ``chunks`` is the list of (start, end) offsets for that
        document.  Computed lazily and cached by LazyProperty.
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            # chunks must be captured NOW: they are derived from the
            # matcher's current position, which advances below
            chunks = list(self.get_chunks())
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows Iteration over results, and lazily generates content.

        *Requires* implementation of ``__getitem__`` method.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getslice__(self, i, j):
        """
        Slicing of ResultWrapper (Python 2 slice protocol).
        """
        # renamed away from 'slice' to avoid shadowing the builtin
        return [self.get_full_content(docid) for docid in self.doc_ids[i:j]]

    def get_full_content(self, docid):
        """
        Return the stored fields for ``docid`` (a ``[docnum, chunks]``
        pair), augmented with the repository-relative file path, the
        short content excerpt and its highlighted HTML form.
        """
        res = self.searcher.stored_fields(docid[0])
        # strip everything up to and including the repository name from
        # the absolute indexed path
        f_path = res['path'][res['path'].find(res['repository'])
                             + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short': content_short,
                    'content_short_hl': self.highlight(content_short),
                    'f_path': f_path})

        return res

    def get_short_content(self, res, chunks):
        """Join the (start, end) chunks cut out of the stored content."""
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but does not overlap chunks so it doesn't highlight the same
        close occurrences twice.

        Yields (start, end) character offsets: each hit is padded by
        ``fragment_size`` on both sides, and a chunk never starts
        before the previous chunk's end.
        """
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            # never re-emit text already covered by the previous chunk
            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)

    def highlight(self, content, top=5):
        """Return ``content`` with the search terms HTML-highlighted."""
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
General Comments 0
You need to be logged in to leave comments. Login now