##// END OF EJS Templates
Fixed search chunking bug and optimized chunk size.
marcink -
r479:149940ba celery
parent child Browse files
Show More
@@ -16,8 +16,6 b' import os'
16 import sys
16 import sys
17 import traceback
17 import traceback
18
18
19
20
21 #to get the pylons_app import
19 #to get the pylons_app import
22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
20 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
23
21
@@ -50,23 +48,21 b' SCHEMA = Schema(owner=TEXT(),'
50 IDX_NAME = 'HG_INDEX'
48 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
49 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
50 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
51
57 class ResultWrapper(object):
52 class ResultWrapper(object):
58 def __init__(self, searcher, matcher, highlight_items):
53 def __init__(self, searcher, matcher, highlight_items):
59 self.searcher = searcher
54 self.searcher = searcher
60 self.matcher = matcher
55 self.matcher = matcher
61 self.highlight_items = highlight_items
56 self.highlight_items = highlight_items
62 self.fragment_size = 150 * 2
57 self.fragment_size = 200 / 2
63
58
64 @LazyProperty
59 @LazyProperty
65 def doc_ids(self):
60 def doc_ids(self):
66 docs_id = []
61 docs_id = []
67 while self.matcher.is_active():
62 while self.matcher.is_active():
68 docnum = self.matcher.id()
63 docnum = self.matcher.id()
69 docs_id.append(docnum)
64 chunks = [offsets for offsets in self.get_chunks()]
65 docs_id.append([docnum, chunks])
70 self.matcher.next()
66 self.matcher.next()
71 return docs_id
67 return docs_id
72
68
@@ -99,18 +95,22 b' class ResultWrapper(object):'
99
95
100
96
101 def get_full_content(self, docid):
97 def get_full_content(self, docid):
102 res = self.searcher.stored_fields(docid)
98 res = self.searcher.stored_fields(docid[0])
103 f_path = res['path'][res['path'].find(res['repository']) \
99 f_path = res['path'][res['path'].find(res['repository']) \
104 + len(res['repository']):].lstrip('/')
100 + len(res['repository']):].lstrip('/')
105
101
106 content_short = ''.join(self.get_short_content(res))
102 content_short = self.get_short_content(res, docid[1])
107 res.update({'content_short':content_short,
103 res.update({'content_short':content_short,
108 'content_short_hl':self.highlight(content_short),
104 'content_short_hl':self.highlight(content_short),
109 'f_path':f_path})
105 'f_path':f_path})
110
106
111 return res
107 return res
112
108
113 def get_short_content(self, res):
109 def get_short_content(self, res, chunks):
110
111 return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
112
113 def get_chunks(self):
114 """
114 """
115 Smart function that implements chunking the content
115 Smart function that implements chunking the content
116 but not overlap chunks so it doesn't highlight the same
116 but not overlap chunks so it doesn't highlight the same
@@ -124,11 +124,11 b' class ResultWrapper(object):'
124 end = span.endchar or 0
124 end = span.endchar or 0
125 start_offseted = max(0, start - self.fragment_size)
125 start_offseted = max(0, start - self.fragment_size)
126 end_offseted = end + self.fragment_size
126 end_offseted = end + self.fragment_size
127 print start_offseted, end_offseted
127
128 if start_offseted < memory[-1][1]:
128 if start_offseted < memory[-1][1]:
129 start_offseted = memory[-1][1]
129 start_offseted = memory[-1][1]
130 memory.append((start_offseted, end_offseted,))
130 memory.append((start_offseted, end_offseted,))
131 yield res["content"][start_offseted:end_offseted]
131 yield (start_offseted, end_offseted,)
132
132
133 def highlight(self, content, top=5):
133 def highlight(self, content, top=5):
134 hl = highlight(escape(content),
134 hl = highlight(escape(content),
General Comments 0
You need to be logged in to leave comments. Log in now