##// END OF EJS Templates
updated whoosh indexer to take path as second argument
marcink -
r452:f19d3ee8 default
parent child Browse files
Show More
@@ -1,212 +1,220
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 # whoosh indexer daemon for hg-app
4 4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 5 #
6 6 # This program is free software; you can redistribute it and/or
7 7 # modify it under the terms of the GNU General Public License
8 8 # as published by the Free Software Foundation; version 2
9 9 # of the License or (at your option) any later version of the license.
10 10 #
11 11 # This program is distributed in the hope that it will be useful,
12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 14 # GNU General Public License for more details.
15 15 #
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program; if not, write to the Free Software
18 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 19 # MA 02110-1301, USA.
20 20 """
21 21 Created on Jan 26, 2010
22 22
23 23 @author: marcink
24 24 A daemon will read from task table and run tasks
25 25 """
26 26 import sys
27 27 import os
28 28 from os.path import dirname as dn
29 29 from os.path import join as jn
30 30
31 31 #to get the pylons_app import
32 32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 33 sys.path.append(project_path)
34 34
35 35 from pidlock import LockHeld, DaemonLock
36 36 import traceback
37 37 from pylons_app.config.environment import load_environment
38 38 from pylons_app.model.hg_model import HgModel
39 39 from pylons_app.lib.helpers import safe_unicode
40 40 from whoosh.index import create_in, open_dir
41 41 from shutil import rmtree
42 42 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
43 43 SCHEMA, IDX_NAME
44 44
45 45 import logging
46 46 import logging.config
47 47 logging.config.fileConfig(jn(project_path, 'development.ini'))
48 48 log = logging.getLogger('whooshIndexer')
49 49
def scan_paths(root_location):
    """Scan ``root_location`` for repositories.

    Thin wrapper that forwards to ``HgModel.repo_scan`` with the fixed
    arguments this script always uses.

    :param root_location: location handed to the scanner unchanged
    :returns: the result of ``HgModel.repo_scan``; the callers in this
        module iterate over its ``.values()``
    """
    scan_args = ('/', root_location, None, True)
    return HgModel.repo_scan(*scan_args)
52 52
class WhooshIndexingDaemon(object):
    """Builds and incrementally refreshes the whoosh index of repository files.

    :param indexname: name of the index opened by :meth:`update_index`
    :param repo_location: location passed to ``scan_paths`` to discover
        the repositories whose files get indexed
    """

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Recursively walk ``root_dir`` and collect every file path in it,
        skipping everything stored inside ``.hg`` directories.

        :param root_dir: directory to walk
        :returns: set of file paths (``root_dir`` joined with the relative
            location of each file); empty when ``root_dir`` cannot be walked
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into the repository
            # metadata. Matching the exact directory name (instead of looking
            # for the substring '.hg' anywhere in the path) keeps unrelated
            # directories such as 'notes.hgold' indexable.
            if '.hg' in dirs:
                dirs.remove('.hg')
            index_paths_.update(jn(path, f) for f in files)

        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add the file at ``path`` to ``writer`` as one document.

        File content is indexed only when the file extension is listed in
        ``INDEX_EXTENSIONS``; other files are indexed with empty content.
        A path that does not exist (for instance a broken symlink) is
        logged and skipped.

        :param writer: index writer exposing ``add_document``
        :param path: path of the file to index
        :param repo: repository object; its ``contact`` and ``name``
            attributes are stored with the document
        :raises IOError, OSError: when the file cannot be inspected or read
            for any reason other than it being missing
        """
        import errno

        # For a file name without a dot this is the whole lower-cased name.
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())

        try:
            # Checked first so a missing file / broken symlink is detected
            # before any attempt to open it.
            modtime = os.path.getmtime(path)

            #we just index the content of choosen files
            if ext in INDEX_EXTENSIONS:
                log.debug(' >> %s [WITH CONTENT]' % path)
                fobj = open(path, 'rb')
                try:
                    content = fobj.read()
                finally:
                    fobj.close()
                u_content = safe_unicode(content)
            else:
                log.debug(' >> %s' % path)
                #just index file name without it's content
                u_content = u''
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                log.debug('path %s does not exist or is a broken symlink' % path)
                return
            # bare raise keeps the original traceback
            raise

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=modtime,
                            extension=ext)

    def build_index(self):
        """Drop any existing index and rebuild it from scratch.

        Removes ``IDX_LOCATION`` when present, recreates it, and indexes
        every file returned by :meth:`get_paths` for each repository found
        by ``scan_paths(self.repo_location)``.
        """
        if os.path.exists(IDX_LOCATION):
            log.debug('removing previos index')
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        # NOTE(review): the index is created under IDX_NAME while
        # update_index() opens self.indexname; these only agree when the
        # daemon is constructed with a matching indexname -- confirm.
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(self.repo_location).values():
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo.path):
                self.add_doc(writer, idx_path, repo)
        writer.commit(merge=True)

        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Bring the existing index in line with the files on disk.

        Documents whose file no longer exists are deleted; documents whose
        file has a newer modification time than the stored ``modtime`` are
        deleted and re-added; files that were never indexed are added.
        """
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
                continue

            # Check if this file was changed since it was indexed
            if os.path.getmtime(indexed_path) > fields['modtime']:
                # The file has changed, delete it and add it to the list of
                # files to reindex
                log.debug('adding to reindex list %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
                to_index.add(indexed_path)

        # Loop over the files in the filesystem
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s' % path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run daemon

        :param full_index: when true rebuild the whole index, otherwise
            only apply the changes found since the last indexing
        """
        if full_index:
            self.build_index()
        else:
            self.update_index()
185 185
if __name__ == "__main__":
    # Usage: <script> [full|incremental] <path to repositories>
    arg = sys.argv[1:]
    if len(arg) != 2:
        sys.stderr.write('Please specify indexing type [full|incremental]'
                         ' and path to repositories as script args \n')
        # non-zero status so callers (cron, shell) can detect the failure
        sys.exit(1)

    if arg[0] == 'full':
        full_index = True
    elif arg[0] == 'incremental':
        # False means looking just for changes
        full_index = False
    else:
        sys.stderr.write('Please use [full|incremental]'
                         ' as script first arg \n')
        sys.exit(1)

    if not os.path.isdir(arg[1]):
        sys.stderr.write('%s is not a valid path \n' % arg[1])
        sys.exit(1)

    # '<path>/*' regardless of whether the path was given with a trailing '/'
    repo_location = jn(arg[1], '*')

    try:
        lock = DaemonLock()
        try:
            WhooshIndexingDaemon(repo_location=repo_location)\
                .run(full_index=full_index)
        finally:
            # release the lock even when indexing fails
            lock.release()
    except LockHeld:
        sys.exit(1)
212 220
General Comments 0
You need to be logged in to leave comments. Login now