##// END OF EJS Templates
fixed path issue
marcink -
r407:0c9dfae5 default
parent child Browse files
Show More
@@ -1,181 +1,181
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 # whoosh indexer daemon for hg-app
3 # whoosh indexer daemon for hg-app
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 #
5 #
6 # This program is free software; you can redistribute it and/or
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 2
8 # as published by the Free Software Foundation; version 2
9 # of the License or (at your option) any later version of the license.
9 # of the License or (at your option) any later version of the license.
10 #
10 #
11 # This program is distributed in the hope that it will be useful,
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
14 # GNU General Public License for more details.
15 #
15 #
16 # You should have received a copy of the GNU General Public License
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
19 # MA 02110-1301, USA.
20 """
20 """
21 Created on Jan 26, 2010
21 Created on Jan 26, 2010
22
22
23 @author: marcink
23 @author: marcink
24 A daemon will read from the task table and run tasks
24 A daemon will read from the task table and run tasks
25 """
25 """
26 import sys
26 import sys
27 import os
27 import os
28 from pidlock import LockHeld, DaemonLock
28 from pidlock import LockHeld, DaemonLock
29 import traceback
29 import traceback
30
30
31 from os.path import dirname as dn
31 from os.path import dirname as dn
32 from os.path import join as jn
32 from os.path import join as jn
33
33
34 #to get the pylons_app import
34 #to get the pylons_app import
35 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
35 sys.path.append(dn(dn(dn(dn(os.path.realpath(__file__))))))
36
36
37 from pylons_app.config.environment import load_environment
37 from pylons_app.config.environment import load_environment
38 from pylons_app.model.hg_model import HgModel
38 from pylons_app.model.hg_model import HgModel
39 from whoosh.index import create_in, open_dir
39 from whoosh.index import create_in, open_dir
40 from shutil import rmtree
40 from shutil import rmtree
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
42 import logging
42 import logging
43 log = logging.getLogger(__name__)
43 log = logging.getLogger(__name__)
44
44
45
45
46 location = '/home/marcink/python_workspace_dirty/*'
46 location = '/home/marcink/python_workspace_dirty/*'
47
47
48 def scan_paths(root_location):
48 def scan_paths(root_location):
49 return HgModel.repo_scan('/', root_location, None, True)
49 return HgModel.repo_scan('/', root_location, None, True)
50
50
51 class WhooshIndexingDaemon(object):
51 class WhooshIndexingDaemon(object):
52 """Deamon for atomic jobs"""
52 """Deamon for atomic jobs"""
53
53
54 def __init__(self, indexname='HG_INDEX'):
54 def __init__(self, indexname='HG_INDEX'):
55 self.indexname = indexname
55 self.indexname = indexname
56
56
57
57
58 def get_paths(self, root_dir):
58 def get_paths(self, root_dir):
59 """recursive walk in root dir and return a set of all path in that dir
59 """recursive walk in root dir and return a set of all path in that dir
60 excluding files in .hg dir"""
60 excluding files in .hg dir"""
61 index_paths_ = set()
61 index_paths_ = set()
62 for path, dirs, files in os.walk(root_dir):
62 for path, dirs, files in os.walk(root_dir):
63 if path.find('.hg') == -1:
63 if path.find('.hg') == -1:
64 for f in files:
64 for f in files:
65 index_paths_.add(jn(path, f))
65 index_paths_.add(jn(path, f))
66
66
67 return index_paths_
67 return index_paths_
68
68
69 def add_doc(self, writer, path, repo):
69 def add_doc(self, writer, path, repo):
70 """Adding doc to writer"""
70 """Adding doc to writer"""
71
71
72 #we don't won't to read excluded file extensions just index them
72 #we don't won't to read excluded file extensions just index them
73 if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
73 if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
74 fobj = open(path, 'rb')
74 fobj = open(path, 'rb')
75 content = fobj.read()
75 content = fobj.read()
76 fobj.close()
76 fobj.close()
77 try:
77 try:
78 u_content = unicode(content)
78 u_content = unicode(content)
79 except UnicodeDecodeError:
79 except UnicodeDecodeError:
80 #incase we have a decode error just represent as byte string
80 #incase we have a decode error just represent as byte string
81 u_content = unicode(str(content).encode('string_escape'))
81 u_content = unicode(str(content).encode('string_escape'))
82 else:
82 else:
83 u_content = u''
83 u_content = u''
84 writer.add_document(owner=unicode(repo.contact),
84 writer.add_document(owner=unicode(repo.contact),
85 repository=u"%s" % repo.name,
85 repository=u"%s" % repo.name,
86 path=u"%s" % path,
86 path=u"%s" % path,
87 content=u_content,
87 content=u_content,
88 modtime=os.path.getmtime(path))
88 modtime=os.path.getmtime(path))
89
89
90 def build_index(self):
90 def build_index(self):
91 if os.path.exists(IDX_LOCATION):
91 if os.path.exists(IDX_LOCATION):
92 rmtree(IDX_LOCATION)
92 rmtree(IDX_LOCATION)
93
93
94 if not os.path.exists(IDX_LOCATION):
94 if not os.path.exists(IDX_LOCATION):
95 os.mkdir(IDX_LOCATION)
95 os.mkdir(IDX_LOCATION)
96
96
97 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
97 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
98 writer = idx.writer()
98 writer = idx.writer()
99
99
100 for cnt, repo in enumerate(scan_paths(location).values()):
100 for cnt, repo in enumerate(scan_paths(location).values()):
101 log.debug('building index @ %s' % repo.path)
101 log.debug('building index @ %s' % repo.path)
102
102
103 for idx_path in self.get_paths(repo.path):
103 for idx_path in self.get_paths(repo.path):
104 log.debug(' >> %s' % idx_path)
104 log.debug(' >> %s' % idx_path)
105 self.add_doc(writer, idx_path, repo)
105 self.add_doc(writer, idx_path, repo)
106 writer.commit(merge=True)
106 writer.commit(merge=True)
107
107
108 log.debug('>>> FINISHED BUILDING INDEX <<<')
108 log.debug('>>> FINISHED BUILDING INDEX <<<')
109
109
110
110
111 def update_index(self):
111 def update_index(self):
112 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
112 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
113
113
114 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
114 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
115 # The set of all paths in the index
115 # The set of all paths in the index
116 indexed_paths = set()
116 indexed_paths = set()
117 # The set of all paths we need to re-index
117 # The set of all paths we need to re-index
118 to_index = set()
118 to_index = set()
119
119
120 reader = idx.reader()
120 reader = idx.reader()
121 writer = idx.writer()
121 writer = idx.writer()
122
122
123 # Loop over the stored fields in the index
123 # Loop over the stored fields in the index
124 for fields in reader.all_stored_fields():
124 for fields in reader.all_stored_fields():
125 indexed_path = fields['path']
125 indexed_path = fields['path']
126 indexed_paths.add(indexed_path)
126 indexed_paths.add(indexed_path)
127
127
128 if not os.path.exists(indexed_path):
128 if not os.path.exists(indexed_path):
129 # This file was deleted since it was indexed
129 # This file was deleted since it was indexed
130 log.debug('removing from index %s' % indexed_path)
130 log.debug('removing from index %s' % indexed_path)
131 writer.delete_by_term('path', indexed_path)
131 writer.delete_by_term('path', indexed_path)
132
132
133 else:
133 else:
134 # Check if this file was changed since it
134 # Check if this file was changed since it
135 # was indexed
135 # was indexed
136 indexed_time = fields['modtime']
136 indexed_time = fields['modtime']
137
137
138 mtime = os.path.getmtime(indexed_path)
138 mtime = os.path.getmtime(indexed_path)
139
139
140 if mtime > indexed_time:
140 if mtime > indexed_time:
141
141
142 # The file has changed, delete it and add it to the list of
142 # The file has changed, delete it and add it to the list of
143 # files to reindex
143 # files to reindex
144 log.debug('adding to reindex list %s' % indexed_path)
144 log.debug('adding to reindex list %s' % indexed_path)
145 writer.delete_by_term('path', indexed_path)
145 writer.delete_by_term('path', indexed_path)
146 to_index.add(indexed_path)
146 to_index.add(indexed_path)
147 #writer.commit()
147 #writer.commit()
148
148
149 # Loop over the files in the filesystem
149 # Loop over the files in the filesystem
150 # Assume we have a function that gathers the filenames of the
150 # Assume we have a function that gathers the filenames of the
151 # documents to be indexed
151 # documents to be indexed
152 for repo in scan_paths(location).values():
152 for repo in scan_paths(location).values():
153 for path in self.get_paths(repo.path):
153 for path in self.get_paths(repo.path):
154 if path in to_index or path not in indexed_paths:
154 if path in to_index or path not in indexed_paths:
155 # This is either a file that's changed, or a new file
155 # This is either a file that's changed, or a new file
156 # that wasn't indexed before. So index it!
156 # that wasn't indexed before. So index it!
157 self.add_doc(writer, path, repo)
157 self.add_doc(writer, path, repo)
158 log.debug('reindexing %s' % path)
158 log.debug('reindexing %s' % path)
159
159
160 writer.commit(merge=True)
160 writer.commit(merge=True)
161 #idx.optimize()
161 #idx.optimize()
162 log.debug('>>> FINISHED <<<')
162 log.debug('>>> FINISHED <<<')
163
163
164 def run(self, full_index=False):
164 def run(self, full_index=False):
165 """Run daemon"""
165 """Run daemon"""
166 if full_index:
166 if full_index:
167 self.build_index()
167 self.build_index()
168 else:
168 else:
169 self.update_index()
169 self.update_index()
170
170
171 if __name__ == "__main__":
171 if __name__ == "__main__":
172
172
173 #config = load_environment()
173 #config = load_environment()
174 #print config
174 #print config
175 try:
175 try:
176 l = DaemonLock()
176 l = DaemonLock()
177 WhooshIndexingDaemon().run(full_index=True)
177 WhooshIndexingDaemon().run(full_index=True)
178 l.release()
178 l.release()
179 except LockHeld:
179 except LockHeld:
180 sys.exit(1)
180 sys.exit(1)
181
181
General Comments 0
You need to be logged in to leave comments. Login now