upstream/kallithea Commit - r452:f19d3ee8

1

#!/usr/bin/env python

1

#!/usr/bin/env python

2

# encoding: utf-8

2

# encoding: utf-8

3

# whoosh indexer daemon for hg-app

3

# whoosh indexer daemon for hg-app

4

5

#

5

#

6

# This program is free software; you can redistribute it and/or

6

# This program is free software; you can redistribute it and/or

7

# modify it under the terms of the GNU General Public License

7

# modify it under the terms of the GNU General Public License

8

# as published by the Free Software Foundation; version 2

8

# as published by the Free Software Foundation; version 2

9

# of the License or (at your opinion) any later version of the license.

9

# of the License or (at your opinion) any later version of the license.

10

#

10

#

11

# This program is distributed in the hope that it will be useful,

11

# This program is distributed in the hope that it will be useful,

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14

# GNU General Public License for more details.

14

# GNU General Public License for more details.

15

#

15

#

16

# You should have received a copy of the GNU General Public License

16

# You should have received a copy of the GNU General Public License

17

# along with this program; if not, write to the Free Software

17

# along with this program; if not, write to the Free Software

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

19

# MA 02110-1301, USA.

19

# MA 02110-1301, USA.

20

"""

20

"""

21

Created on Jan 26, 2010

21

Created on Jan 26, 2010

22

23

@author: marcink

23

@author: marcink

24

A deamon will read from task table and run tasks

24

A deamon will read from task table and run tasks

25

"""

25

"""

26

import sys

26

import sys

27

import os

27

import os

28

from os.path import dirname as dn

28

from os.path import dirname as dn

29

from os.path import join as jn

29

from os.path import join as jn

30

31

#to get the pylons_app import

31

#to get the pylons_app import

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

33

sys.path.append(project_path)

33

sys.path.append(project_path)

34

35

from pidlock import LockHeld, DaemonLock

35

from pidlock import LockHeld, DaemonLock

36

import traceback

36

import traceback

37

from pylons_app.config.environment import load_environment

37

from pylons_app.config.environment import load_environment

38

from pylons_app.model.hg_model import HgModel

38

from pylons_app.model.hg_model import HgModel

39

from pylons_app.lib.helpers import safe_unicode

39

from pylons_app.lib.helpers import safe_unicode

40

from whoosh.index import create_in, open_dir

40

from whoosh.index import create_in, open_dir

41

from shutil import rmtree

41

from shutil import rmtree

42

from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \

42

from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \

43

SCHEMA, IDX_NAME

43

SCHEMA, IDX_NAME

44

45

import logging

45

import logging

46

import logging.config

46

import logging.config

47

logging.config.fileConfig(jn(project_path, 'development.ini'))

47

logging.config.fileConfig(jn(project_path, 'development.ini'))

48

log = logging.getLogger('whooshIndexer')

48

log = logging.getLogger('whooshIndexer')

49

50

def scan_paths(root_location):

50

def scan_paths(root_location):

51

return HgModel.repo_scan('/', root_location, None, True)

51

return HgModel.repo_scan('/', root_location, None, True)

52

53

class WhooshIndexingDaemon(object):

53

class WhooshIndexingDaemon(object):

54

"""Deamon for atomic jobs"""

54

"""Deamon for atomic jobs"""

55

56

def __init__(self, indexname='HG_INDEX', repo_location=None):

56

def __init__(self, indexname='HG_INDEX', repo_location=None):

57

self.indexname = indexname

57

self.indexname = indexname

58

self.repo_location = repo_location

58

self.repo_location = repo_location

59

60

def get_paths(self, root_dir):

60

def get_paths(self, root_dir):

61

"""recursive walk in root dir and return a set of all path in that dir

61

"""recursive walk in root dir and return a set of all path in that dir

62

excluding files in .hg dir"""

62

excluding files in .hg dir"""

63

index_paths_ = set()

63

index_paths_ = set()

64

for path, dirs, files in os.walk(root_dir):

64

for path, dirs, files in os.walk(root_dir):

65

if path.find('.hg') == -1:

65

if path.find('.hg') == -1:

66

for f in files:

66

for f in files:

67

index_paths_.add(jn(path, f))

67

index_paths_.add(jn(path, f))

68

69

return index_paths_

69

return index_paths_

70

71

def add_doc(self, writer, path, repo):

71

def add_doc(self, writer, path, repo):

72

"""Adding doc to writer"""

72

"""Adding doc to writer"""

73

74

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

74

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

75

#we just index the content of choosen files

75

#we just index the content of choosen files

76

if ext in INDEX_EXTENSIONS:

76

if ext in INDEX_EXTENSIONS:

77

log.debug(' >> %s [WITH CONTENT]' % path)

77

log.debug(' >> %s [WITH CONTENT]' % path)

78

fobj = open(path, 'rb')

78

fobj = open(path, 'rb')

79

content = fobj.read()

79

content = fobj.read()

80

fobj.close()

80

fobj.close()

81

u_content = safe_unicode(content)

81

u_content = safe_unicode(content)

82

else:

82

else:

83

log.debug(' >> %s' % path)

83

log.debug(' >> %s' % path)

84

#just index file name without it's content

84

#just index file name without it's content

85

u_content = u''

85

u_content = u''

86

87

88

89

try:

89

try:

90

os.stat(path)

90

os.stat(path)

91

writer.add_document(owner=unicode(repo.contact),

91

writer.add_document(owner=unicode(repo.contact),

92

repository=u"%s" % repo.name,

92

repository=u"%s" % repo.name,

93

path=u"%s" % path,

93

path=u"%s" % path,

94

content=u_content,

94

content=u_content,

95

modtime=os.path.getmtime(path),

95

modtime=os.path.getmtime(path),

96

extension=ext)

96

extension=ext)

97

except OSError, e:

97

except OSError, e:

98

import errno

98

import errno

99

if e.errno == errno.ENOENT:

99

if e.errno == errno.ENOENT:

100

log.debug('path %s does not exist or is a broken symlink' % path)

100

log.debug('path %s does not exist or is a broken symlink' % path)

101

else:

101

else:

102

raise e

102

raise e

103

104

105

def build_index(self):

105

def build_index(self):

106

if os.path.exists(IDX_LOCATION):

106

if os.path.exists(IDX_LOCATION):

107

log.debug('removing previos index')

107

log.debug('removing previos index')

108

rmtree(IDX_LOCATION)

108

rmtree(IDX_LOCATION)

109

110

if not os.path.exists(IDX_LOCATION):

110

if not os.path.exists(IDX_LOCATION):

111

os.mkdir(IDX_LOCATION)

111

os.mkdir(IDX_LOCATION)

112

113

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

113

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

114

writer = idx.writer()

114

writer = idx.writer()

115

116

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

116

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

117

log.debug('building index @ %s' % repo.path)

117

log.debug('building index @ %s' % repo.path)

118

119

for idx_path in self.get_paths(repo.path):

119

for idx_path in self.get_paths(repo.path):

120

self.add_doc(writer, idx_path, repo)

120

self.add_doc(writer, idx_path, repo)

121

writer.commit(merge=True)

121

writer.commit(merge=True)

122

123

log.debug('>>> FINISHED BUILDING INDEX <<<')

123

log.debug('>>> FINISHED BUILDING INDEX <<<')

124

125

126

def update_index(self):

126

def update_index(self):

127

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

127

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

128

129

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

129

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

130

# The set of all paths in the index

130

# The set of all paths in the index

131

indexed_paths = set()

131

indexed_paths = set()

132

# The set of all paths we need to re-index

132

# The set of all paths we need to re-index

133

to_index = set()

133

to_index = set()

134

135

reader = idx.reader()

135

reader = idx.reader()

136

writer = idx.writer()

136

writer = idx.writer()

137

138

# Loop over the stored fields in the index

138

# Loop over the stored fields in the index

139

for fields in reader.all_stored_fields():

139

for fields in reader.all_stored_fields():

140

indexed_path = fields['path']

140

indexed_path = fields['path']

141

indexed_paths.add(indexed_path)

141

indexed_paths.add(indexed_path)

142

143

if not os.path.exists(indexed_path):

143

if not os.path.exists(indexed_path):

144

# This file was deleted since it was indexed

144

# This file was deleted since it was indexed

145

log.debug('removing from index %s' % indexed_path)

145

log.debug('removing from index %s' % indexed_path)

146

writer.delete_by_term('path', indexed_path)

146

writer.delete_by_term('path', indexed_path)

147

148

else:

148

else:

149

# Check if this file was changed since it

149

# Check if this file was changed since it

150

# was indexed

150

# was indexed

151

indexed_time = fields['modtime']

151

indexed_time = fields['modtime']

152

153

mtime = os.path.getmtime(indexed_path)

153

mtime = os.path.getmtime(indexed_path)

154

155

if mtime > indexed_time:

155

if mtime > indexed_time:

156

157

# The file has changed, delete it and add it to the list of

157

# The file has changed, delete it and add it to the list of

158

# files to reindex

158

# files to reindex

159

log.debug('adding to reindex list %s' % indexed_path)

159

log.debug('adding to reindex list %s' % indexed_path)

160

writer.delete_by_term('path', indexed_path)

160

writer.delete_by_term('path', indexed_path)

161

to_index.add(indexed_path)

161

to_index.add(indexed_path)

162

#writer.commit()

162

#writer.commit()

163

164

# Loop over the files in the filesystem

164

# Loop over the files in the filesystem

165

# Assume we have a function that gathers the filenames of the

165

# Assume we have a function that gathers the filenames of the

166

# documents to be indexed

166

# documents to be indexed

167

for repo in scan_paths(self.repo_location).values():

167

for repo in scan_paths(self.repo_location).values():

168

for path in self.get_paths(repo.path):

168

for path in self.get_paths(repo.path):

169

if path in to_index or path not in indexed_paths:

169

if path in to_index or path not in indexed_paths:

170

# This is either a file that's changed, or a new file

170

# This is either a file that's changed, or a new file

171

# that wasn't indexed before. So index it!

171

# that wasn't indexed before. So index it!

172

self.add_doc(writer, path, repo)

172

self.add_doc(writer, path, repo)

173

log.debug('reindexing %s' % path)

173

log.debug('reindexing %s' % path)

174

175

writer.commit(merge=True)

175

writer.commit(merge=True)

176

#idx.optimize()

176

#idx.optimize()

177

log.debug('>>> FINISHED <<<')

177

log.debug('>>> FINISHED <<<')

178

179

def run(self, full_index=False):

179

def run(self, full_index=False):

180

"""Run daemon"""

180

"""Run daemon"""

181

if full_index:

181

if full_index:

182

self.build_index()

182

self.build_index()

183

else:

183

else:

184

self.update_index()

184

self.update_index()

185

186

if __name__ == "__main__":

186

if __name__ == "__main__":

187

arg = sys.argv[1:]

187

arg = sys.argv[1:]

188

if ~~not~~ ~~arg~~:

188

if len(arg) != 2:

189

sys.std~~out~~.write('Please specify indexing type [full|incremental]'

189

sys.stderr.write('Please specify indexing type [full|incremental]'

190

' as script arg \n')

190

'and path to repositories as script args \n')

191

sys.exit()

191

sys.exit()

192

193

192

if arg[0] == 'full':

194

if arg[0] == 'full':

193

full_index = True

195

full_index = True

194

elif arg[0] == 'incremental':

196

elif arg[0] == 'incremental':

195

# False means looking just for changes

197

# False means looking just for changes

196

full_index = False

198

full_index = False

197

else:

199

else:

198

sys.stdout.write('Please use [full|incremental]'

200

sys.stdout.write('Please use [full|incremental]'

199

' as script arg \n')

201

' as script first arg \n')

200

sys.exit()

202

sys.exit()

201

203

202

204

if not os.path.isdir(arg[1]):

203

repo_location = '/home/hg_repos/*'

205

sys.stderr.write('%s is not a valid path \n' % arg[1])

206

sys.exit()

207

else:

208

if arg[1].endswith('/'):

209

repo_location = arg[1] + '*'

210

else:

211

repo_location = arg[1] + '/*'

204

212

205

try:

213

try:

206

l = DaemonLock()

214

l = DaemonLock()

207

WhooshIndexingDaemon(repo_location=repo_location)\

215

WhooshIndexingDaemon(repo_location=repo_location)\

208

.run(full_index=full_index)

216

.run(full_index=full_index)

209

l.release()

217

l.release()

210

except LockHeld:

218

except LockHeld:

211

sys.exit(1)

219

sys.exit(1)

212

220

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #!/usr/bin/env python
             # encoding: utf-8
             # whoosh indexer daemon for hg-app
             # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
             #
             # This program is free software; you can redistribute it and/or
             # modify it under the terms of the GNU General Public License
             # as published by the Free Software Foundation; version 2
             # of the License or (at your option) any later version of the license.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software
             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
             # MA  02110-1301, USA.
             """
             Created on Jan 26, 2010
             @author: marcink
             A daemon will read from task table and run tasks
             """
             import sys
             import os
             from os.path import dirname as dn
             from os.path import join as jn
             #to get the pylons_app import
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from pidlock import LockHeld, DaemonLock
             import traceback
             from pylons_app.config.environment import load_environment
             from pylons_app.model.hg_model import HgModel
             from pylons_app.lib.helpers import safe_unicode
             from whoosh.index import create_in, open_dir
             from shutil import rmtree
             from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
             SCHEMA, IDX_NAME
             import logging
             import logging.config
             logging.config.fileConfig(jn(project_path, 'development.ini'))
             log = logging.getLogger('whooshIndexer')
             def scan_paths(root_location):
                 """Scan ``root_location`` for repositories via ``HgModel.repo_scan``.

                 The result is returned unchanged; callers in this file iterate its
                 ``.values()`` and read ``path``, ``name`` and ``contact`` from each item.
                 """
                 found_repos = HgModel.repo_scan('/', root_location, None, True)
                 return found_repos
             class WhooshIndexingDaemon(object):
                 """Build or incrementally refresh the Whoosh index of repository files.

                 ``run`` dispatches either to ``build_index`` (drop the index directory
                 and index everything again) or to ``update_index`` (touch only new,
                 changed and deleted files).
                 """

                 def __init__(self, indexname='HG_INDEX', repo_location=None):
                     """Store the index name and the location to scan for repositories.

                     :param indexname: index name ``update_index`` passes to ``open_dir``.
                     :param repo_location: value handed to ``scan_paths`` to discover
                         the repositories to index.
                     """
                     self.indexname = indexname
                     self.repo_location = repo_location

                 def get_paths(self, root_dir):
                     """Return the set of all file paths found below ``root_dir``.

                     Directories named exactly ``.hg`` are pruned from the walk, so
                     their content is neither visited nor returned. Matching on the
                     directory name (rather than on a ``.hg`` substring of the whole
                     path) keeps files such as ``docs.hgx/readme.txt`` and still works
                     when ``root_dir`` itself contains ``.hg`` somewhere in its path.

                     :param root_dir: directory to walk recursively.
                     :returns: set of paths, each built as ``join(dirpath, filename)``.
                     """
                     index_paths_ = set()
                     for path, dirs, files in os.walk(root_dir):
                         # edit in place: os.walk only descends into what stays in dirs
                         dirs[:] = [d for d in dirs if d != '.hg']
                         for f in files:
                             index_paths_.add(jn(path, f))

                     return index_paths_

                 def add_doc(self, writer, path, repo):
                     """Add one document describing the file at ``path`` to ``writer``.

                     The file content is indexed only when its extension is listed in
                     ``INDEX_EXTENSIONS``; any other file is stored with empty content.
                     A path that does not exist (or is a broken symlink) is logged and
                     skipped, whether the failure shows up while reading the file or
                     while fetching its modification time.

                     :param writer: index writer providing ``add_document``.
                     :param path: path of the file to index.
                     :param repo: repository object; its ``contact`` and ``name``
                         attributes are stored with the document.
                     :raises EnvironmentError: any I/O failure other than ENOENT.
                     """
                     import errno

                     # a file name without a dot yields the whole lower-cased name here
                     ext = unicode(path.split('/')[-1].split('.')[-1].lower())

                     try:
                         #we just index the content of chosen files
                         if ext in INDEX_EXTENSIONS:
                             log.debug('    >> %s [WITH CONTENT]', path)
                             fobj = open(path, 'rb')
                             try:
                                 content = fobj.read()
                             finally:
                                 # close the handle even when read() fails
                                 fobj.close()
                             u_content = safe_unicode(content)
                         else:
                             log.debug('    >> %s', path)
                             #just index file name without its content
                             u_content = u''
                         modtime = os.path.getmtime(path)
                     except EnvironmentError:
                         # EnvironmentError covers both IOError (open/read) and OSError
                         # (stat); sys.exc_info() avoids version-specific except syntax
                         err = sys.exc_info()[1]
                         if err.errno != errno.ENOENT:
                             raise
                         log.debug('path %s does not exist or is a broken symlink', path)
                         return

                     writer.add_document(owner=unicode(repo.contact),
                                         repository=u"%s" % repo.name,
                                         path=u"%s" % path,
                                         content=u_content,
                                         modtime=modtime,
                                         extension=ext)

                 def build_index(self):
                     """Remove any existing index directory and index every repository.

                     All documents are added through a single writer and committed once
                     at the end.
                     """
                     if os.path.exists(IDX_LOCATION):
                         log.debug('removing previos index')
                         rmtree(IDX_LOCATION)

                     if not os.path.exists(IDX_LOCATION):
                         os.mkdir(IDX_LOCATION)

                     # NOTE(review): the index is created under IDX_NAME while
                     # update_index opens self.indexname - confirm the two always agree
                     idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
                     writer = idx.writer()

                     for repo in scan_paths(self.repo_location).values():
                         log.debug('building index @ %s', repo.path)

                         for idx_path in self.get_paths(repo.path):
                             self.add_doc(writer, idx_path, repo)
                     writer.commit(merge=True)

                     log.debug('>>> FINISHED BUILDING INDEX <<<')

                 def update_index(self):
                     """Bring the existing index up to date without rebuilding it.

                     First pass, over the stored documents: entries whose file no
                     longer exists are deleted; entries whose file has a newer mtime
                     than the stored ``modtime`` are deleted and queued for re-indexing.
                     Second pass, over the files on disk: every queued path and every
                     path not yet in the index is added. Everything is committed once.
                     """
                     log.debug('STARTING INCREMENTAL INDEXING UPDATE')

                     idx = open_dir(IDX_LOCATION, indexname=self.indexname)
                     # The set of all paths in the index
                     indexed_paths = set()
                     # The set of all paths we need to re-index
                     to_index = set()

                     reader = idx.reader()
                     writer = idx.writer()

                     # Loop over the stored fields in the index
                     for fields in reader.all_stored_fields():
                         indexed_path = fields['path']
                         indexed_paths.add(indexed_path)

                         if not os.path.exists(indexed_path):
                             # This file was deleted since it was indexed
                             log.debug('removing from index %s', indexed_path)
                             writer.delete_by_term('path', indexed_path)

                         else:
                             # Check if this file was changed since it
                             # was indexed
                             indexed_time = fields['modtime']

                             mtime = os.path.getmtime(indexed_path)

                             if mtime > indexed_time:

                                 # The file has changed, delete it and add it to the
                                 # list of files to reindex
                                 log.debug('adding to reindex list %s', indexed_path)
                                 writer.delete_by_term('path', indexed_path)
                                 to_index.add(indexed_path)

                     # Loop over the files in the filesystem
                     for repo in scan_paths(self.repo_location).values():
                         for path in self.get_paths(repo.path):
                             if path in to_index or path not in indexed_paths:
                                 # This is either a file that's changed, or a new file
                                 # that wasn't indexed before. So index it!
                                 self.add_doc(writer, path, repo)
                                 log.debug('reindexing %s', path)

                     writer.commit(merge=True)
                     log.debug('>>> FINISHED <<<')

                 def run(self, full_index=False):
                     """Run the indexer.

                     :param full_index: when true rebuild the whole index with
                         ``build_index``; otherwise apply ``update_index``.
                     """
                     if full_index:
                         self.build_index()
                     else:
                         self.update_index()
             # Command-line entry point, shown here as a unified diff of this commit:
             # lines prefixed with '-' are the removed version, '+' the added one.
             # Usage after the change: <script> [full|incremental] <path to repositories>
             if __name__ == "__main__":
                 arg = sys.argv[1:]
                 # exactly two arguments are required: indexing type and repositories path
-                if not arg:
+                if len(arg) != 2:
-                    sys.stdout.write('Please specify indexing type [full|incremental]'
+                    sys.stderr.write('Please specify indexing type [full|incremental]'
-                                     ' as script arg \n')
+                                     'and path to repositories as script args \n')
                     # NOTE(review): the two added literals join without a space
                     # ("[full|incremental]and path ..."), and sys.exit() without an
                     # argument reports exit status 0 even though this is a usage error
                     sys.exit()
                 # first argument selects a full rebuild or an incremental update
                 if arg[0] == 'full':
                     full_index = True
                 elif arg[0] == 'incremental':
                     # False means looking just for changes
                     full_index = False
                 else:
                     # NOTE(review): still written to stdout, unlike the other error
                     # messages that this commit moved to stderr
                     sys.stdout.write('Please use [full|incremental]'
-                                     ' as script arg \n')
+                                     ' as script first arg \n')
                     sys.exit()
                 # second argument must be an existing directory; '/*' (or '*' when it
                 # already ends with '/') is appended to form repo_location
+                if not os.path.isdir(arg[1]):
-                repo_location = '/home/hg_repos/*'
+                    sys.stderr.write('%s is not a valid path \n' % arg[1])
+                    sys.exit()
+                else:
+                    if arg[1].endswith('/'):
+                        repo_location = arg[1] + '*'
+                    else:
+                        repo_location = arg[1] + '/*'
                 # take the daemon lock, run the indexer, release the lock; LockHeld
                 # (raised by pidlock) ends the script with exit status 1
                 # NOTE(review): l.release() is skipped when run() raises - consider
                 # moving it into a finally clause
                 try:
                     l = DaemonLock()
                     WhooshIndexingDaemon(repo_location=repo_location)\
                         .run(full_index=full_index)
                     l.release()
                 except LockHeld:
                     sys.exit(1)