##// END OF EJS Templates
fixed search chunking bug and optimized chunk size
marcink -
r479:149940ba celery
parent child Browse files
Show More
@@ -1,140 +1,140 b''
1 from os.path import dirname as dn, join as jn
1 from os.path import dirname as dn, join as jn
2 from pidlock import LockHeld, DaemonLock
2 from pidlock import LockHeld, DaemonLock
3 from pylons_app.config.environment import load_environment
3 from pylons_app.config.environment import load_environment
4 from pylons_app.model.hg_model import HgModel
4 from pylons_app.model.hg_model import HgModel
5 from shutil import rmtree
5 from shutil import rmtree
6 from webhelpers.html.builder import escape
6 from webhelpers.html.builder import escape
7 from vcs.utils.lazy import LazyProperty
7 from vcs.utils.lazy import LazyProperty
8
8
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
9 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
10 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
11 from whoosh.index import create_in, open_dir
11 from whoosh.index import create_in, open_dir
12 from whoosh.formats import Characters
12 from whoosh.formats import Characters
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
14
14
15 import os
15 import os
16 import sys
16 import sys
17 import traceback
17 import traceback
18
18
19
20
21 #to get the pylons_app import
19 #to get the pylons_app import
22 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
20 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
23
21
24
22
25 #LOCATION WE KEEP THE INDEX
23 #LOCATION WE KEEP THE INDEX
26 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
24 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
27
25
28 #EXTENSIONS WE WANT TO INDEX CONTENT OF
26 #EXTENSIONS WE WANT TO INDEX CONTENT OF
29 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
27 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
30 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
28 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
31 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
29 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
32 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
30 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
33 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
31 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
34 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
32 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
35 'yaws']
33 'yaws']
36
34
37 #CUSTOM ANALYZER wordsplit + lowercase filter
35 #CUSTOM ANALYZER wordsplit + lowercase filter
38 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
36 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
39
37
40
38
41 #INDEX SCHEMA DEFINITION
39 #INDEX SCHEMA DEFINITION
42 SCHEMA = Schema(owner=TEXT(),
40 SCHEMA = Schema(owner=TEXT(),
43 repository=TEXT(stored=True),
41 repository=TEXT(stored=True),
44 path=ID(stored=True, unique=True),
42 path=ID(stored=True, unique=True),
45 content=FieldType(format=Characters(ANALYZER),
43 content=FieldType(format=Characters(ANALYZER),
46 scorable=True, stored=True),
44 scorable=True, stored=True),
47 modtime=STORED(), extension=TEXT(stored=True))
45 modtime=STORED(), extension=TEXT(stored=True))
48
46
49
47
50 IDX_NAME = 'HG_INDEX'
48 IDX_NAME = 'HG_INDEX'
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
49 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
52 FRAGMENTER = SimpleFragmenter(200)
50 FRAGMENTER = SimpleFragmenter(200)
53
54
55
56
51
57 class ResultWrapper(object):
52 class ResultWrapper(object):
58 def __init__(self, searcher, matcher, highlight_items):
53 def __init__(self, searcher, matcher, highlight_items):
59 self.searcher = searcher
54 self.searcher = searcher
60 self.matcher = matcher
55 self.matcher = matcher
61 self.highlight_items = highlight_items
56 self.highlight_items = highlight_items
62 self.fragment_size = 150 * 2
57 self.fragment_size = 200 / 2
63
58
64 @LazyProperty
59 @LazyProperty
65 def doc_ids(self):
60 def doc_ids(self):
66 docs_id = []
61 docs_id = []
67 while self.matcher.is_active():
62 while self.matcher.is_active():
68 docnum = self.matcher.id()
63 docnum = self.matcher.id()
69 docs_id.append(docnum)
64 chunks = [offsets for offsets in self.get_chunks()]
65 docs_id.append([docnum, chunks])
70 self.matcher.next()
66 self.matcher.next()
71 return docs_id
67 return docs_id
72
68
73 def __str__(self):
69 def __str__(self):
74 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
70 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
75
71
76 def __repr__(self):
72 def __repr__(self):
77 return self.__str__()
73 return self.__str__()
78
74
79 def __len__(self):
75 def __len__(self):
80 return len(self.doc_ids)
76 return len(self.doc_ids)
81
77
82 def __iter__(self):
78 def __iter__(self):
83 """
79 """
84 Allows iteration over results, and lazily generates content
80 Allows iteration over results, and lazily generates content
85
81
86 *Requires* implementation of ``__getitem__`` method.
82 *Requires* implementation of ``__getitem__`` method.
87 """
83 """
88 for docid in self.doc_ids:
84 for docid in self.doc_ids:
89 yield self.get_full_content(docid)
85 yield self.get_full_content(docid)
90
86
91 def __getslice__(self, i, j):
87 def __getslice__(self, i, j):
92 """
88 """
93 Slicing of resultWrapper
89 Slicing of resultWrapper
94 """
90 """
95 slice = []
91 slice = []
96 for docid in self.doc_ids[i:j]:
92 for docid in self.doc_ids[i:j]:
97 slice.append(self.get_full_content(docid))
93 slice.append(self.get_full_content(docid))
98 return slice
94 return slice
99
95
100
96
101 def get_full_content(self, docid):
97 def get_full_content(self, docid):
102 res = self.searcher.stored_fields(docid)
98 res = self.searcher.stored_fields(docid[0])
103 f_path = res['path'][res['path'].find(res['repository']) \
99 f_path = res['path'][res['path'].find(res['repository']) \
104 + len(res['repository']):].lstrip('/')
100 + len(res['repository']):].lstrip('/')
105
101
106 content_short = ''.join(self.get_short_content(res))
102 content_short = self.get_short_content(res, docid[1])
107 res.update({'content_short':content_short,
103 res.update({'content_short':content_short,
108 'content_short_hl':self.highlight(content_short),
104 'content_short_hl':self.highlight(content_short),
109 'f_path':f_path})
105 'f_path':f_path})
110
106
111 return res
107 return res
112
108
113 def get_short_content(self, res):
109 def get_short_content(self, res, chunks):
110
111 return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
112
113 def get_chunks(self):
114 """
114 """
115 Smart function that splits the content into chunks around each match
115 Smart function that splits the content into chunks around each match
116 without overlapping them, so that closely spaced occurrences are not
116 without overlapping them, so that closely spaced occurrences are not
117 highlighted twice.
117 highlighted twice.
118 @param matcher:
118 @param matcher:
119 @param size:
119 @param size:
120 """
120 """
121 memory = [(0, 0)]
121 memory = [(0, 0)]
122 for span in self.matcher.spans():
122 for span in self.matcher.spans():
123 start = span.startchar or 0
123 start = span.startchar or 0
124 end = span.endchar or 0
124 end = span.endchar or 0
125 start_offseted = max(0, start - self.fragment_size)
125 start_offseted = max(0, start - self.fragment_size)
126 end_offseted = end + self.fragment_size
126 end_offseted = end + self.fragment_size
127 print start_offseted, end_offseted
127
128 if start_offseted < memory[-1][1]:
128 if start_offseted < memory[-1][1]:
129 start_offseted = memory[-1][1]
129 start_offseted = memory[-1][1]
130 memory.append((start_offseted, end_offseted,))
130 memory.append((start_offseted, end_offseted,))
131 yield res["content"][start_offseted:end_offseted]
131 yield (start_offseted, end_offseted,)
132
132
133 def highlight(self, content, top=5):
133 def highlight(self, content, top=5):
134 hl = highlight(escape(content),
134 hl = highlight(escape(content),
135 self.highlight_items,
135 self.highlight_items,
136 analyzer=ANALYZER,
136 analyzer=ANALYZER,
137 fragmenter=FRAGMENTER,
137 fragmenter=FRAGMENTER,
138 formatter=FORMATTER,
138 formatter=FORMATTER,
139 top=top)
139 top=top)
140 return hl
140 return hl
General Comments 0
You need to be logged in to leave comments. Login now