@@ -1,140 +1,140 @@
 from os.path import dirname as dn, join as jn
 from pidlock import LockHeld, DaemonLock
 from pylons_app.config.environment import load_environment
 from pylons_app.model.hg_model import HgModel
 from shutil import rmtree
 from webhelpers.html.builder import escape
 from vcs.utils.lazy import LazyProperty
 
 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 from whoosh.index import create_in, open_dir
 from whoosh.formats import Characters
 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
 
 import os
 import sys
 import traceback
 
-
-
 #to get the pylons_app import
 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 
 
 #LOCATION WE KEEP THE INDEX
 IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')
 
 #EXTENSIONS WE WANT TO INDEX CONTENT OFF
 INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                     'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                     'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                     'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                     'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                     'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                     'yaws']
 
 #CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
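
The composed ANALYZER only splits on \w+ and lowercases, so identifiers keep their underscores but lose case. A minimal sketch of that behaviour, assuming Whoosh's usual callable-analyzer interface (the sample text is made up, not part of the changeset):

    tokens = [t.text for t in ANALYZER(u"def Print_Lines(self): pass")]
    # -> [u'def', u'print_lines', u'self', u'pass']
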
 
 
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(owner=TEXT(),
                 repository=TEXT(stored=True),
                 path=ID(stored=True, unique=True),
                 content=FieldType(format=Characters(ANALYZER),
                                   scorable=True, stored=True),
                 modtime=STORED(), extension=TEXT(stored=True))
 
 
 IDX_NAME = 'HG_INDEX'
 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 FRAGMENTER = SimpleFragmenter(200)
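
create_in and open_dir are imported above, so SCHEMA and IDX_NAME are presumably consumed roughly like this elsewhere in the indexer; a sketch under that assumption, with a hypothetical _open_idx helper that is not part of this file:

    def _open_idx(location=IDX_LOCATION, name=IDX_NAME):
        # build a fresh index on the first run, reuse it afterwards
        if not os.path.exists(location):
            os.makedirs(location)
            return create_in(location, SCHEMA, indexname=name)
        return open_dir(location, indexname=name)
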
-
-
-
 
 class ResultWrapper(object):
     def __init__(self, searcher, matcher, highlight_items):
         self.searcher = searcher
         self.matcher = matcher
         self.highlight_items = highlight_items
-        self.fragment_size =
+        self.fragment_size = 200 / 2
 
     @LazyProperty
     def doc_ids(self):
         docs_id = []
         while self.matcher.is_active():
             docnum = self.matcher.id()
-            docs_id.append(docnum)
+            chunks = [offsets for offsets in self.get_chunks()]
+            docs_id.append([docnum, chunks])
             self.matcher.next()
         return docs_id
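
With this change each doc_ids entry pairs the Whoosh document number with the pre-computed excerpt offsets, so the property returns a structure of roughly this shape (illustrative values only):

    # [[docnum, [(chunk_start, chunk_end), ...]], ...]
    # e.g. [[0, [(0, 100), (255, 455)]], [7, [(120, 320)]]]

get_full_content below then unpacks docid[0] for the stored fields and docid[1] for the short-content chunks.
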
 
     def __str__(self):
         return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
 
     def __repr__(self):
         return self.__str__()
 
     def __len__(self):
         return len(self.doc_ids)
 
     def __iter__(self):
         """
         Allows Iteration over results,and lazy generate content
 
         *Requires* implementation of ``__getitem__`` method.
         """
         for docid in self.doc_ids:
             yield self.get_full_content(docid)
 
     def __getslice__(self, i, j):
         """
         Slicing of resultWrapper
         """
         slice = []
         for docid in self.doc_ids[i:j]:
             slice.append(self.get_full_content(docid))
         return slice
 
 
     def get_full_content(self, docid):
-        res = self.searcher.stored_fields(docid)
+        res = self.searcher.stored_fields(docid[0])
         f_path = res['path'][res['path'].find(res['repository']) \
                              + len(res['repository']):].lstrip('/')
 
-        content_short =
+        content_short = self.get_short_content(res, docid[1])
         res.update({'content_short':content_short,
                     'content_short_hl':self.highlight(content_short),
                     'f_path':f_path})
 
         return res
-
-    def get_short_content(self, res):
+
+    def get_short_content(self, res, chunks):
+
+        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
+
+    def get_chunks(self):
         """
         Smart function that implements chunking the content
         but not overlap chunks so it doesn't highlight the same
         close occurences twice.
         @param matcher:
         @param size:
         """
         memory = [(0, 0)]
         for span in self.matcher.spans():
             start = span.startchar or 0
             end = span.endchar or 0
             start_offseted = max(0, start - self.fragment_size)
             end_offseted = end + self.fragment_size
-            print start_offseted, end_offseted
+
             if start_offseted < memory[-1][1]:
                 start_offseted = memory[-1][1]
             memory.append((start_offseted, end_offseted,))
-            yield
+            yield (start_offseted, end_offseted,)
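
The offset arithmetic above can be read on its own: every hit is widened by fragment_size characters on both sides, but a chunk is never allowed to start before the previous chunk ended, so excerpts do not overlap. A self-contained sketch of the same rule (the names are mine, not the changeset's):

    def merge_chunks(spans, fragment_size=100):
        # widen every (start, end) hit by fragment_size on each side,
        # clamping the start to the end of the previous chunk
        memory = [(0, 0)]
        for start, end in spans:
            start_off = max(0, start - fragment_size)
            end_off = end + fragment_size
            if start_off < memory[-1][1]:
                start_off = memory[-1][1]
            memory.append((start_off, end_off))
            yield (start_off, end_off)

    # two hits 15 characters apart become adjacent, non-overlapping chunks:
    # list(merge_chunks([(150, 155), (170, 175)])) -> [(50, 255), (255, 275)]
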
 
     def highlight(self, content, top=5):
         hl = highlight(escape(content),
                        self.highlight_items,
                        analyzer=ANALYZER,
                        fragmenter=FRAGMENTER,
                        formatter=FORMATTER,
                        top=top)
         return hl
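
How the pieces fit together, as a hedged usage sketch (the query/controller wiring is outside this diff, and the query text and highlight terms are made up):

    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
    searcher = idx.searcher()
    query = QueryParser('content', schema=SCHEMA).parse(u'lazyproperty')
    matcher = query.matcher(searcher)

    results = ResultWrapper(searcher, matcher, highlight_items=set([u'lazyproperty']))
    for res in results[0:10]:   # __getslice__ only renders the sliced hits
        print res['f_path'], res['content_short_hl']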