upstream/kallithea Commit - r465:e01a85f9

1

#!/usr/bin/env python

1

#!/usr/bin/env python

2

# encoding: utf-8

2

# encoding: utf-8

3

# whoosh indexer daemon for hg-app

3

# whoosh indexer daemon for hg-app

4

5

#

5

#

6

# This program is free software; you can redistribute it and/or

6

# This program is free software; you can redistribute it and/or

7

# modify it under the terms of the GNU General Public License

7

# modify it under the terms of the GNU General Public License

8

# as published by the Free Software Foundation; version 2

8

# as published by the Free Software Foundation; version 2

9

# of the License or (at your opinion) any later version of the license.

9

# of the License or (at your opinion) any later version of the license.

10

#

10

#

11

# This program is distributed in the hope that it will be useful,

11

# This program is distributed in the hope that it will be useful,

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14

# GNU General Public License for more details.

14

# GNU General Public License for more details.

15

#

15

#

16

# You should have received a copy of the GNU General Public License

16

# You should have received a copy of the GNU General Public License

17

# along with this program; if not, write to the Free Software

17

# along with this program; if not, write to the Free Software

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

19

# MA 02110-1301, USA.

19

# MA 02110-1301, USA.

20

"""

20

"""

21

Created on Jan 26, 2010

21

Created on Jan 26, 2010

22

23

@author: marcink

23

@author: marcink

24

A deamon will read from task table and run tasks

24

A deamon will read from task table and run tasks

25

"""

25

"""

26

import sys

26

import sys

27

import os

27

import os

28

from os.path import dirname as dn

28

from os.path import dirname as dn

29

from os.path import join as jn

29

from os.path import join as jn

30

31

#to get the pylons_app import

31

#to get the pylons_app import

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

33

sys.path.append(project_path)

33

sys.path.append(project_path)

34

35

from pidlock import LockHeld, DaemonLock

35

from pidlock import LockHeld, DaemonLock

36

import traceback

36

import traceback

37

from pylons_app.config.environment import load_environment

37

from pylons_app.config.environment import load_environment

38

from pylons_app.model.hg_model import HgModel

38

from pylons_app.model.hg_model import HgModel

39

from pylons_app.lib.helpers import safe_unicode

39

from pylons_app.lib.helpers import safe_unicode

40

from whoosh.index import create_in, open_dir

40

from whoosh.index import create_in, open_dir

41

from shutil import rmtree

41

from shutil import rmtree

42

from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \

42

from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \

43

SCHEMA, IDX_NAME

43

SCHEMA, IDX_NAME

44

45

import logging

45

import logging

46

import logging.config

46

import logging.config

47

logging.config.fileConfig(jn(project_path, 'development.ini'))

47

logging.config.fileConfig(jn(project_path, 'development.ini'))

48

log = logging.getLogger('whooshIndexer')

48

log = logging.getLogger('whooshIndexer')

49

50

def scan_paths(root_location):

50

def scan_paths(root_location):

51

return HgModel.repo_scan('/', root_location, None, True)

51

return HgModel.repo_scan('/', root_location, None, True)

52

53

class WhooshIndexingDaemon(object):

53

class WhooshIndexingDaemon(object):

54

"""Deamon for atomic jobs"""

54

"""Deamon for atomic jobs"""

55

56

def __init__(self, indexname='HG_INDEX', repo_location=None):

56

def __init__(self, indexname='HG_INDEX', repo_location=None):

57

self.indexname = indexname

57

self.indexname = indexname

58

self.repo_location = repo_location

58

self.repo_location = repo_location

59

self.initial = False

60

if not os.path.isdir(IDX_LOCATION):

61

os.mkdir(IDX_LOCATION)

62

log.info('Cannot run incremental index since it does not'

63

' yet exist running full build')

64

self.initial = True

59

65

60

def get_paths(self, root_dir):

66

def get_paths(self, root_dir):

61

"""recursive walk in root dir and return a set of all path in that dir

67

"""recursive walk in root dir and return a set of all path in that dir

62

excluding files in .hg dir"""

68

excluding files in .hg dir"""

63

index_paths_ = set()

69

index_paths_ = set()

64

for path, dirs, files in os.walk(root_dir):

70

for path, dirs, files in os.walk(root_dir):

65

if path.find('.hg') == -1:

71

if path.find('.hg') == -1:

66

for f in files:

72

for f in files:

67

index_paths_.add(jn(path, f))

73

index_paths_.add(jn(path, f))

68

74

69

return index_paths_

75

return index_paths_

70

76

71

def add_doc(self, writer, path, repo):

77

def add_doc(self, writer, path, repo):

72

"""Adding doc to writer"""

78

"""Adding doc to writer"""

73

79

74

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

80

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

75

#we just index the content of choosen files

81

#we just index the content of choosen files

76

if ext in INDEX_EXTENSIONS:

82

if ext in INDEX_EXTENSIONS:

77

log.debug(' >> %s [WITH CONTENT]' % path)

83

log.debug(' >> %s [WITH CONTENT]' % path)

78

fobj = open(path, 'rb')

84

fobj = open(path, 'rb')

79

content = fobj.read()

85

content = fobj.read()

80

fobj.close()

86

fobj.close()

81

u_content = safe_unicode(content)

87

u_content = safe_unicode(content)

82

else:

88

else:

83

log.debug(' >> %s' % path)

89

log.debug(' >> %s' % path)

84

#just index file name without it's content

90

#just index file name without it's content

85

u_content = u''

91

u_content = u''

86

92

87

93

88

94

89

try:

95

try:

90

os.stat(path)

96

os.stat(path)

91

writer.add_document(owner=unicode(repo.contact),

97

writer.add_document(owner=unicode(repo.contact),

92

repository=u"%s" % repo.name,

98

repository=u"%s" % repo.name,

93

path=u"%s" % path,

99

path=u"%s" % path,

94

content=u_content,

100

content=u_content,

95

modtime=os.path.getmtime(path),

101

modtime=os.path.getmtime(path),

96

extension=ext)

102

extension=ext)

97

except OSError, e:

103

except OSError, e:

98

import errno

104

import errno

99

if e.errno == errno.ENOENT:

105

if e.errno == errno.ENOENT:

100

log.debug('path %s does not exist or is a broken symlink' % path)

106

log.debug('path %s does not exist or is a broken symlink' % path)

101

else:

107

else:

102

raise e

108

raise e

103

109

104

110

105

def build_index(self):

111

def build_index(self):

106

if os.path.exists(IDX_LOCATION):

112

if os.path.exists(IDX_LOCATION):

107

log.debug('removing previos index')

113

log.debug('removing previos index')

108

rmtree(IDX_LOCATION)

114

rmtree(IDX_LOCATION)

109

115

110

if not os.path.exists(IDX_LOCATION):

116

if not os.path.exists(IDX_LOCATION):

111

os.mkdir(IDX_LOCATION)

117

os.mkdir(IDX_LOCATION)

112

118

113

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

119

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

114

writer = idx.writer()

120

writer = idx.writer()

115

121

116

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

122

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

117

log.debug('building index @ %s' % repo.path)

123

log.debug('building index @ %s' % repo.path)

118

124

119

for idx_path in self.get_paths(repo.path):

125

for idx_path in self.get_paths(repo.path):

120

self.add_doc(writer, idx_path, repo)

126

self.add_doc(writer, idx_path, repo)

121

writer.commit(merge=True)

127

writer.commit(merge=True)

122

128

123

log.debug('>>> FINISHED BUILDING INDEX <<<')

129

log.debug('>>> FINISHED BUILDING INDEX <<<')

124

130

125

131

126

def update_index(self):

132

def update_index(self):

127

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

133

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

128

134

129

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

135

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

130

# The set of all paths in the index

136

# The set of all paths in the index

131

indexed_paths = set()

137

indexed_paths = set()

132

# The set of all paths we need to re-index

138

# The set of all paths we need to re-index

133

to_index = set()

139

to_index = set()

134

140

135

reader = idx.reader()

141

reader = idx.reader()

136

writer = idx.writer()

142

writer = idx.writer()

137

143

138

# Loop over the stored fields in the index

144

# Loop over the stored fields in the index

139

for fields in reader.all_stored_fields():

145

for fields in reader.all_stored_fields():

140

indexed_path = fields['path']

146

indexed_path = fields['path']

141

indexed_paths.add(indexed_path)

147

indexed_paths.add(indexed_path)

142

148

143

if not os.path.exists(indexed_path):

149

if not os.path.exists(indexed_path):

144

# This file was deleted since it was indexed

150

# This file was deleted since it was indexed

145

log.debug('removing from index %s' % indexed_path)

151

log.debug('removing from index %s' % indexed_path)

146

writer.delete_by_term('path', indexed_path)

152

writer.delete_by_term('path', indexed_path)

147

153

148

else:

154

else:

149

# Check if this file was changed since it

155

# Check if this file was changed since it

150

# was indexed

156

# was indexed

151

indexed_time = fields['modtime']

157

indexed_time = fields['modtime']

152

158

153

mtime = os.path.getmtime(indexed_path)

159

mtime = os.path.getmtime(indexed_path)

154

160

155

if mtime > indexed_time:

161

if mtime > indexed_time:

156

162

157

# The file has changed, delete it and add it to the list of

163

# The file has changed, delete it and add it to the list of

158

# files to reindex

164

# files to reindex

159

log.debug('adding to reindex list %s' % indexed_path)

165

log.debug('adding to reindex list %s' % indexed_path)

160

writer.delete_by_term('path', indexed_path)

166

writer.delete_by_term('path', indexed_path)

161

to_index.add(indexed_path)

167

to_index.add(indexed_path)

162

#writer.commit()

168

#writer.commit()

163

169

164

# Loop over the files in the filesystem

170

# Loop over the files in the filesystem

165

# Assume we have a function that gathers the filenames of the

171

# Assume we have a function that gathers the filenames of the

166

# documents to be indexed

172

# documents to be indexed

167

for repo in scan_paths(self.repo_location).values():

173

for repo in scan_paths(self.repo_location).values():

168

for path in self.get_paths(repo.path):

174

for path in self.get_paths(repo.path):

169

if path in to_index or path not in indexed_paths:

175

if path in to_index or path not in indexed_paths:

170

# This is either a file that's changed, or a new file

176

# This is either a file that's changed, or a new file

171

# that wasn't indexed before. So index it!

177

# that wasn't indexed before. So index it!

172

self.add_doc(writer, path, repo)

178

self.add_doc(writer, path, repo)

173

log.debug('reindexing %s' % path)

179

log.debug('reindexing %s' % path)

174

180

175

writer.commit(merge=True)

181

writer.commit(merge=True)

176

#idx.optimize()

182

#idx.optimize()

177

log.debug('>>> FINISHED <<<')

183

log.debug('>>> FINISHED <<<')

178

184

179

def run(self, full_index=False):

185

def run(self, full_index=False):

180

"""Run daemon"""

186

"""Run daemon"""

181

if full_index:

187

if full_index or self.initial:

182

self.build_index()

188

self.build_index()

183

else:

189

else:

184

self.update_index()

190

self.update_index()

185

191

186

if __name__ == "__main__":

192

if __name__ == "__main__":

187

arg = sys.argv[1:]

193

arg = sys.argv[1:]

188

if len(arg) != 2:

194

if len(arg) != 2:

189

sys.stderr.write('Please specify indexing type [full|incremental]'

195

sys.stderr.write('Please specify indexing type [full|incremental]'

190

'and path to repositories as script args \n')

196

'and path to repositories as script args \n')

191

sys.exit()

197

sys.exit()

192

198

193

199

194

if arg[0] == 'full':

200

if arg[0] == 'full':

195

full_index = True

201

full_index = True

196

elif arg[0] == 'incremental':

202

elif arg[0] == 'incremental':

197

# False means looking just for changes

203

# False means looking just for changes

198

full_index = False

204

full_index = False

199

else:

205

else:

200

sys.stdout.write('Please use [full|incremental]'

206

sys.stdout.write('Please use [full|incremental]'

201

' as script first arg \n')

207

' as script first arg \n')

202

sys.exit()

208

sys.exit()

203

209

204

if not os.path.isdir(arg[1]):

210

if not os.path.isdir(arg[1]):

205

sys.stderr.write('%s is not a valid path \n' % arg[1])

211

sys.stderr.write('%s is not a valid path \n' % arg[1])

206

sys.exit()

212

sys.exit()

207

else:

213

else:

208

if arg[1].endswith('/'):

214

if arg[1].endswith('/'):

209

repo_location = arg[1] + '*'

215

repo_location = arg[1] + '*'

210

else:

216

else:

211

repo_location = arg[1] + '/*'

217

repo_location = arg[1] + '/*'

212

218

213

try:

219

try:

214

l = DaemonLock()

220

l = DaemonLock()

215

WhooshIndexingDaemon(repo_location=repo_location)\

221

WhooshIndexingDaemon(repo_location=repo_location)\

216

.run(full_index=full_index)

222

.run(full_index=full_index)

217

l.release()

223

l.release()

218

except LockHeld:

224

except LockHeld:

219

sys.exit(1)

225

sys.exit(1)

220

226

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #!/usr/bin/env python
             # encoding: utf-8
             # whoosh indexer daemon for hg-app
             # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
             #
             # This program is free software; you can redistribute it and/or
             # modify it under the terms of the GNU General Public License
             # as published by the Free Software Foundation; version 2
             # of the License or (at your option) any later version of the license.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software
             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
             # MA  02110-1301, USA.
             """
             Created on Jan 26, 2010
             @author: marcink
             A daemon will read from task table and run tasks
             """
             import sys
             import os
             from os.path import dirname as dn
             from os.path import join as jn
             #to get the pylons_app import
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from pidlock import LockHeld, DaemonLock
             import traceback
             from pylons_app.config.environment import load_environment
             from pylons_app.model.hg_model import HgModel
             from pylons_app.lib.helpers import safe_unicode
             from whoosh.index import create_in, open_dir
             from shutil import rmtree
             from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
             SCHEMA, IDX_NAME
             import logging
             import logging.config
             logging.config.fileConfig(jn(project_path, 'development.ini'))
             log = logging.getLogger('whooshIndexer')
             def scan_paths(root_location):
                 """Scan ``root_location`` for repositories via ``HgModel.repo_scan``.

                 :param root_location: location handed to ``HgModel.repo_scan``
                 :returns: the result of ``HgModel.repo_scan``; callers in this file
                     iterate over its ``.values()``
                 """
                 repositories = HgModel.repo_scan('/', root_location, None, True)
                 return repositories
             class WhooshIndexingDaemon(object):
                 """Deamon for atomic jobs"""
                 def __init__(self, indexname='HG_INDEX', repo_location=None):
                     """Store the index name and the repository location.

                     When ``IDX_LOCATION`` is not an existing directory it is created
                     and ``self.initial`` is set to True; ``run`` then performs a full
                     build even if an incremental update was requested.

                     :param indexname: index name used by ``update_index`` when it
                         opens the index
                     :param repo_location: value passed to ``scan_paths`` to find the
                         repositories to index
                     """
                     self.indexname = indexname
                     self.repo_location = repo_location
+                    self.initial = False
+                    if not os.path.isdir(IDX_LOCATION):
+                        os.mkdir(IDX_LOCATION)
+                        log.info('Cannot run incremental index since it does not'
+                                 ' yet exist running full build')
+                        self.initial = True
                 def get_paths(self, root_dir):
                     """Recursively walk ``root_dir`` and collect the files to index.

                     Directories named exactly ``.hg`` are pruned from the walk, so
                     Mercurial metadata is neither visited nor returned.  Only whole
                     directory names are compared: a substring test on the path would
                     also drop unrelated files such as ``docs.hgbackup/notes.txt``.

                     :param root_dir: directory to walk
                     :returns: set of file paths, each joined onto the walked directory
                     """
                     index_paths_ = set()
                     for path, dirs, files in os.walk(root_dir):
                         # edit the list in place so os.walk does not descend into .hg
                         dirs[:] = [d for d in dirs if d != '.hg']
                         index_paths_.update(jn(path, f) for f in files)
                     return index_paths_
                 def add_doc(self, writer, path, repo):
                     """Add the file at ``path`` to the index through ``writer``.

                     Files whose extension is listed in ``INDEX_EXTENSIONS`` are indexed
                     together with their content; every other file is indexed with an
                     empty content field.

                     A file that does not exist (or is a broken symlink) is logged and
                     skipped.  The modification time is looked up before the file is
                     opened and ``IOError`` is handled next to ``OSError``, so a
                     dangling path with an indexed extension is skipped too instead of
                     escaping from ``open()``.  Errors raised by the writer itself are
                     not swallowed.

                     :param writer: index writer; its ``add_document`` method is called
                     :param path: filesystem path of the file to index
                     :param repo: repository object providing ``contact`` and ``name``
                     :raises IOError, OSError: for filesystem errors other than ENOENT
                     """
                     import errno

                     ext = unicode(path.split('/')[-1].split('.')[-1].lower())
                     try:
                         # fails with ENOENT for missing files and broken symlinks
                         modtime = os.path.getmtime(path)
                         #we just index the content of choosen files
                         if ext in INDEX_EXTENSIONS:
                             log.debug('    >> %s [WITH CONTENT]' % path)
                             fobj = open(path, 'rb')
                             try:
                                 content = fobj.read()
                             finally:
                                 # close the handle even when the read fails
                                 fobj.close()
                             u_content = safe_unicode(content)
                         else:
                             log.debug('    >> %s' % path)
                             #just index file name without its content
                             u_content = u''
                     except (IOError, OSError):
                         # fetch the exception without version-specific binding syntax
                         e = sys.exc_info()[1]
                         if e.errno != errno.ENOENT:
                             # bare raise keeps the original traceback
                             raise
                         log.debug('path %s does not exist or is a broken symlink' % path)
                         return

                     writer.add_document(owner=unicode(repo.contact),
                                         repository=u"%s" % repo.name,
                                         path=u"%s" % path,
                                         content=u_content,
                                         modtime=modtime,
                                         extension=ext)
                 def build_index(self):
                     """Rebuild the whole index from scratch.

                     An existing ``IDX_LOCATION`` tree is removed and the directory is
                     recreated, a new index named ``IDX_NAME`` is created in it, every
                     path returned by ``get_paths`` for every scanned repository is
                     passed to ``add_doc``, and the writer is committed once at the end.
                     """
                     # NOTE(review): the index is created as IDX_NAME while update_index
                     # opens self.indexname -- confirm the two always agree
                     index_exists = os.path.exists(IDX_LOCATION)
                     if index_exists:
                         log.debug('removing previos index')
                         rmtree(IDX_LOCATION)
                     if not os.path.exists(IDX_LOCATION):
                         os.mkdir(IDX_LOCATION)

                     index = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
                     index_writer = index.writer()

                     repositories = scan_paths(self.repo_location).values()
                     for repository in repositories:
                         log.debug('building index @ %s' % repository.path)
                         for file_path in self.get_paths(repository.path):
                             self.add_doc(index_writer, file_path, repository)

                     index_writer.commit(merge=True)
                     log.debug('>>> FINISHED BUILDING INDEX <<<')
                 def update_index(self):
                     """Bring the existing index in line with the filesystem.

                     First pass, over the stored documents: entries whose path no longer
                     exists are deleted; entries whose file is newer than the stored
                     ``modtime`` are deleted and remembered for re-indexing.  Second
                     pass, over the files of every scanned repository: remembered and
                     previously unknown paths are added with ``add_doc``.  All changes
                     are committed once at the end.
                     """
                     log.debug('STARTING INCREMENTAL INDEXING UPDATE')

                     index = open_dir(IDX_LOCATION, indexname=self.indexname)
                     known_paths = set()   # every path currently stored in the index
                     stale_paths = set()   # stored paths whose file changed on disk

                     index_reader = index.reader()
                     index_writer = index.writer()

                     for stored in index_reader.all_stored_fields():
                         stored_path = stored['path']
                         known_paths.add(stored_path)

                         if not os.path.exists(stored_path):
                             # the file vanished after it was indexed
                             log.debug('removing from index %s' % stored_path)
                             index_writer.delete_by_term('path', stored_path)
                             continue

                         stored_mtime = stored['modtime']
                         current_mtime = os.path.getmtime(stored_path)
                         if current_mtime > stored_mtime:
                             # drop the outdated document now, it is re-added below
                             log.debug('adding to reindex list %s' % stored_path)
                             index_writer.delete_by_term('path', stored_path)
                             stale_paths.add(stored_path)

                     for repository in scan_paths(self.repo_location).values():
                         for file_path in self.get_paths(repository.path):
                             unchanged = (file_path not in stale_paths
                                          and file_path in known_paths)
                             if unchanged:
                                 continue
                             # changed since it was indexed, or never indexed before
                             self.add_doc(index_writer, file_path, repository)
                             log.debug('reindexing %s' % file_path)

                     index_writer.commit(merge=True)
                     log.debug('>>> FINISHED <<<')
                 def run(self, full_index=False):
                     """Run the indexer once.

                     A full build is also done when ``self.initial`` is set.

                     :param full_index: when true, rebuild the index from scratch with
                         ``build_index``; otherwise update it with ``update_index``
                     """
-                    if full_index:
+                    if full_index or self.initial:
                         self.build_index()
                     else:
                         self.update_index()
             if __name__ == "__main__":
                 # Usage: <script> full|incremental <path to repositories>
                 # Exit status: 2 for invalid arguments, 1 when LockHeld is raised.
                 arg = sys.argv[1:]
                 if len(arg) != 2:
                     sys.stderr.write('Please specify indexing type [full|incremental]'
                                      ' and path to repositories as script args \n')
                     sys.exit(2)
                 if arg[0] == 'full':
                     full_index = True
                 elif arg[0] == 'incremental':
                     # False means looking just for changes
                     full_index = False
                 else:
                     # errors go to stderr, like the other argument checks
                     sys.stderr.write('Please use [full|incremental]'
                                      ' as script first arg \n')
                     sys.exit(2)
                 if not os.path.isdir(arg[1]):
                     sys.stderr.write('%s is not a valid path \n' % arg[1])
                     sys.exit(2)
                 # append a '*' wildcard component to the given directory
                 if arg[1].endswith('/'):
                     repo_location = arg[1] + '*'
                 else:
                     repo_location = arg[1] + '/*'
                 try:
                     lock = DaemonLock()
                     try:
                         WhooshIndexingDaemon(repo_location=repo_location)\
                             .run(full_index=full_index)
                     finally:
                         # release the lock even when indexing fails
                         lock.release()
                 except LockHeld:
                     sys.exit(1)