Show More
@@ -1,140 +1,140 b'' | |||||
from os.path import dirname as dn, join as jn

from pidlock import LockHeld, DaemonLock
from pylons_app.config.environment import load_environment
from pylons_app.model.hg_model import HgModel
from shutil import rmtree
from webhelpers.html.builder import escape
from vcs.utils.lazy import LazyProperty

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

import os
import sys
import traceback

# to get the pylons_app import
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))


# LOCATION WE KEEP THE INDEX
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')

# EXTENSIONS WE WANT TO INDEX CONTENT OFF
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']

# CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


# INDEX SCHEMA DEFINITION
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=FieldType(format=Characters(ANALYZER),
                                  scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))


IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = SimpleFragmenter(200)
||||
class ResultWrapper(object):
    """
    Lazy wrapper around a whoosh searcher/matcher pair.

    Iterating or slicing yields the stored fields of each matched document,
    augmented with a shortened, highlighted excerpt of its content.
    """
    def __init__(self, searcher, matcher, highlight_items):
        """
        :param searcher: whoosh Searcher used to fetch stored fields
        :param matcher: whoosh Matcher positioned on the query results
        :param highlight_items: terms to highlight in the excerpts
        """
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # half of the SimpleFragmenter(200) size: context taken on each side
        # of a matched span when chunking content (// keeps it an int on py3)
        self.fragment_size = 200 // 2

    @LazyProperty
    def doc_ids(self):
        """
        Consume the matcher once and return a list of ``[docnum, chunks]``
        pairs.  Chunks must be collected here, while the matcher is still
        positioned on the document, because the matcher is forward-only.
        """
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            chunks = [offsets for offsets in self.get_chunks()]
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows Iteration over results,and lazy generate content

        *Requires* implementation of ``__getitem__`` method.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getslice__(self, i, j):
        """
        Slicing of resultWrapper
        """
        # 'result' rather than 'slice' to avoid shadowing the builtin
        result = []
        for docid in self.doc_ids[i:j]:
            result.append(self.get_full_content(docid))
        return result

    def get_full_content(self, docid):
        """
        Return the stored fields for ``docid`` (a ``[docnum, chunks]`` pair)
        extended with ``f_path``, ``content_short`` and ``content_short_hl``.
        """
        res = self.searcher.stored_fields(docid[0])
        # path relative to the repository root
        f_path = res['path'][res['path'].find(res['repository']) \
            + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short':content_short,
                    'content_short_hl':self.highlight(content_short),
                    'f_path':f_path})

        return res

    def get_short_content(self, res, chunks):
        """
        Join the ``(start, end)`` chunk offsets into one excerpt string.
        """
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but not overlap chunks so it doesn't highlight the same
        close occurences twice.
        """
        # memory holds the previously emitted chunk so overlapping
        # context windows are clipped against it
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)

    def highlight(self, content, top=5):
        """
        Return ``content`` with the query terms highlighted, escaped for
        safe HTML embedding.

        :param top: maximum number of fragments to return
        """
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
General Comments 0
You need to be logged in to leave comments.
Login now