##// END OF EJS Templates
added support for broken symlinks in whoosh indexer
marcink -
r441:c59c4d43 default
parent child Browse files
Show More
@@ -1,188 +1,199 b''
1 1 #!/usr/bin/env python
2 2 # encoding: utf-8
3 3 # whoosh indexer daemon for hg-app
4 4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 5 #
6 6 # This program is free software; you can redistribute it and/or
7 7 # modify it under the terms of the GNU General Public License
8 8 # as published by the Free Software Foundation; version 2
9 9 # of the License or (at your option) any later version of the license.
10 10 #
11 11 # This program is distributed in the hope that it will be useful,
12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 14 # GNU General Public License for more details.
15 15 #
16 16 # You should have received a copy of the GNU General Public License
17 17 # along with this program; if not, write to the Free Software
18 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 19 # MA 02110-1301, USA.
20 20 """
21 21 Created on Jan 26, 2010
22 22
23 23 @author: marcink
24 24 A daemon that reads from the task table and runs tasks
25 25 """
26 26 import sys
27 27 import os
28 28 from os.path import dirname as dn
29 29 from os.path import join as jn
30 30
31 31 #to get the pylons_app import
32 32 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
33 33 sys.path.append(project_path)
34 34
35 35 from pidlock import LockHeld, DaemonLock
36 36 import traceback
37 37 from pylons_app.config.environment import load_environment
38 38 from pylons_app.model.hg_model import HgModel
39 39 from whoosh.index import create_in, open_dir
40 40 from shutil import rmtree
41 41 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
42 42 SCHEMA, IDX_NAME
43 43
44 44 import logging
45 45 import logging.config
46 46 logging.config.fileConfig(jn(project_path, 'development.ini'))
47 47 log = logging.getLogger('whooshIndexer')
48 48
def scan_paths(root_location):
    """Scan *root_location* for repositories via ``HgModel.repo_scan``.

    The result is returned untouched; callers in this module iterate over
    its ``.values()`` and read ``path``, ``name`` and ``contact`` from each
    item.
    """
    found_repos = HgModel.repo_scan('/', root_location, None, True)
    return found_repos
51 51
class WhooshIndexingDaemon(object):
    """Daemon that builds and incrementally updates the whoosh index.

    ``build_index`` recreates the index under ``IDX_LOCATION`` from scratch,
    ``update_index`` only touches documents whose files were removed,
    modified or newly added, and ``run`` picks one of the two.
    """

    def __init__(self, indexname='HG_INDEX', repo_location=None):
        """
        :param indexname: name of the index opened by ``update_index``
        :param repo_location: location passed to ``scan_paths`` to discover
            the repositories that should be indexed
        """
        self.indexname = indexname
        self.repo_location = repo_location

    def get_paths(self, root_dir):
        """Walk *root_dir* recursively and return the set of all file paths
        found in it, excluding everything stored inside ``.hg`` directories.

        :param root_dir: directory to walk
        :returns: set of paths, each one built with ``jn(dirpath, filename)``
        """
        index_paths_ = set()
        for path, dirs, files in os.walk(root_dir):
            # Prune in place so os.walk never descends into the repository
            # store. Matching the exact directory name (instead of looking
            # for the substring '.hg' anywhere in the path) keeps files in
            # directories such as 'my.hgext' or below 'repos.hg/' indexed.
            if '.hg' in dirs:
                dirs.remove('.hg')
            index_paths_.update(jn(path, f) for f in files)

        return index_paths_

    def add_doc(self, writer, path, repo):
        """Add a single file to the index through *writer*.

        Every file is indexed by owner, repository, path, modification time
        and extension; the file content is indexed only when the extension
        is listed in ``INDEX_EXTENSIONS``. Paths that do not exist (for
        example broken symlinks) are logged and skipped.

        :param writer: index writer, its ``add_document`` is called
        :param path: path of the file to index
        :param repo: repository object, ``contact`` and ``name`` are read
        :raises OSError: when stat of *path* fails for any reason other
            than the path not existing
        """
        # local import, the module header does not import errno
        import errno

        # Stat before anything else: a broken symlink must be detected
        # before open() is attempted, and only the stat call is guarded so
        # that errors coming from the writer are never swallowed here.
        try:
            modtime = os.stat(path).st_mtime
        except OSError as err:
            if err.errno != errno.ENOENT:
                # bare raise keeps the original traceback
                raise
            log.debug('path %s does not exist or is a broken symlink' % path)
            return

        # the "extension" is whatever follows the last dot of the file name
        # (the whole lowercased name when there is no dot)
        ext = unicode(path.split('/')[-1].split('.')[-1].lower())

        # we just index the content of chosen files
        if ext in INDEX_EXTENSIONS:
            log.debug(' >> %s [WITH CONTENT]' % path)
            with open(path, 'rb') as fobj:
                content = fobj.read()
            try:
                u_content = unicode(content)
            except UnicodeDecodeError:
                # in case we have a decode error just represent the content
                # as an escaped byte string
                u_content = unicode(str(content).encode('string_escape'))
        else:
            log.debug(' >> %s' % path)
            # just index the file name without its content
            u_content = u''

        writer.add_document(owner=unicode(repo.contact),
                            repository=u"%s" % repo.name,
                            path=u"%s" % path,
                            content=u_content,
                            modtime=modtime,
                            extension=ext)

    def build_index(self):
        """Remove any existing index directory and build a fresh index of
        all files of all repositories returned by ``scan_paths``.
        """
        if os.path.exists(IDX_LOCATION):
            log.debug('removing previos index')
            rmtree(IDX_LOCATION)

        if not os.path.exists(IDX_LOCATION):
            os.mkdir(IDX_LOCATION)

        # NOTE(review): the index is created under IDX_NAME while
        # update_index opens self.indexname - presumably both are meant to
        # be the same name; confirm against pylons_app.lib.indexers.
        idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
        writer = idx.writer()

        for repo in scan_paths(self.repo_location).values():
            log.debug('building index @ %s' % repo.path)

            for idx_path in self.get_paths(repo.path):
                self.add_doc(writer, idx_path, repo)
        writer.commit(merge=True)

        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_index(self):
        """Incrementally update the existing index.

        Documents whose file no longer exists are deleted, documents whose
        file has a newer modification time than the stored one are deleted
        and re-added, and files that are not in the index yet are added.
        """
        log.debug('STARTING INCREMENTAL INDEXING UPDATE')

        idx = open_dir(IDX_LOCATION, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        reader = idx.reader()
        writer = idx.writer()

        # Loop over the stored fields in the index
        for fields in reader.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                log.debug('removing from index %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
                continue

            # Check if this file was changed since it was indexed
            indexed_time = fields['modtime']
            mtime = os.path.getmtime(indexed_path)

            if mtime > indexed_time:
                # The file has changed, delete it and add it to the list
                # of files to reindex
                log.debug('adding to reindex list %s' % indexed_path)
                writer.delete_by_term('path', indexed_path)
                to_index.add(indexed_path)

        # Loop over the files in the filesystem
        for repo in scan_paths(self.repo_location).values():
            for path in self.get_paths(repo.path):
                if path in to_index or path not in indexed_paths:
                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    self.add_doc(writer, path, repo)
                    log.debug('reindexing %s' % path)

        writer.commit(merge=True)
        log.debug('>>> FINISHED <<<')

    def run(self, full_index=False):
        """Run daemon

        :param full_index: when true the index is rebuilt from scratch,
            otherwise only an incremental update is performed
        """
        if full_index:
            self.build_index()
        else:
            self.update_index()
177 188
# Script entry point: take the daemon lock, run one indexing pass and
# release the lock again.
if __name__ == "__main__":
    repo_location = '/home/marcink/hg_repos/*'
    full_index = True  # False means looking just for changes
    try:
        lock = DaemonLock()
        try:
            WhooshIndexingDaemon(repo_location=repo_location)\
                .run(full_index=full_index)
        finally:
            # release the lock even when indexing fails, otherwise it stays
            # held after a crashed run
            lock.release()
    except LockHeld:
        # the lock is already held, exit with an error status
        sys.exit(1)
188 199
General Comments 0
You need to be logged in to leave comments. Login now