upstream/kallithea Commit - r557:29ec9ddb

1

#!/usr/bin/env python

1

#!/usr/bin/env python

2

# encoding: utf-8

2

# encoding: utf-8

3

# whoosh indexer daemon for rhodecode

3

# whoosh indexer daemon for rhodecode

4

5

#

5

#

6

# This program is free software; you can redistribute it and/or

6

# This program is free software; you can redistribute it and/or

7

# modify it under the terms of the GNU General Public License

7

# modify it under the terms of the GNU General Public License

8

# as published by the Free Software Foundation; version 2

8

# as published by the Free Software Foundation; version 2

9

# of the License or (at your opinion) any later version of the license.

9

# of the License or (at your opinion) any later version of the license.

10

#

10

#

11

# This program is distributed in the hope that it will be useful,

11

# This program is distributed in the hope that it will be useful,

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

12

# but WITHOUT ANY WARRANTY; without even the implied warranty of

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14

# GNU General Public License for more details.

14

# GNU General Public License for more details.

15

#

15

#

16

# You should have received a copy of the GNU General Public License

16

# You should have received a copy of the GNU General Public License

17

# along with this program; if not, write to the Free Software

17

# along with this program; if not, write to the Free Software

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

18

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

19

# MA 02110-1301, USA.

19

# MA 02110-1301, USA.

20

"""

20

"""

21

Created on Jan 26, 2010

21

Created on Jan 26, 2010

22

23

@author: marcink

23

@author: marcink

24

A deamon will read from task table and run tasks

24

A deamon will read from task table and run tasks

25

"""

25

"""

26

import sys

26

import sys

27

import os

27

import os

28

from os.path import dirname as dn

28

from os.path import dirname as dn

29

from os.path import join as jn

29

from os.path import join as jn

30

31

#to get the rhodecode import

31

#to get the rhodecode import

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

32

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

33

sys.path.append(project_path)

33

sys.path.append(project_path)

34

35

from rhodecode.lib.pidlock import LockHeld, DaemonLock

35

from rhodecode.lib.pidlock import LockHeld, DaemonLock

36

from rhodecode.model.hg_model import HgModel

36

from rhodecode.model.hg_model import HgModel

37

from rhodecode.lib.helpers import safe_unicode

37

from rhodecode.lib.helpers import safe_unicode

38

from whoosh.index import create_in, open_dir

38

from whoosh.index import create_in, open_dir

39

from shutil import rmtree

39

from shutil import rmtree

40

from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME

40

from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME

41

42

import logging

42

import logging

43

44

log = logging.getLogger('whooshIndexer')

44

log = logging.getLogger('whooshIndexer')

45

# create logger

45

# create logger

46

log.setLevel(logging.DEBUG)

46

log.setLevel(logging.DEBUG)

47

log.propagate = False

47

log.propagate = False

48

# create console handler and set level to debug

48

# create console handler and set level to debug

49

ch = logging.StreamHandler()

49

ch = logging.StreamHandler()

50

ch.setLevel(logging.DEBUG)

50

ch.setLevel(logging.DEBUG)

51

52

# create formatter

52

# create formatter

53

formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

53

formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

54

55

# add formatter to ch

55

# add formatter to ch

56

ch.setFormatter(formatter)

56

ch.setFormatter(formatter)

57

58

# add ch to logger

58

# add ch to logger

59

log.addHandler(ch)

59

log.addHandler(ch)

60

61

def scan_paths(root_location):

61

def scan_paths(root_location):

62

return HgModel.repo_scan('/', root_location, None, True)

62

return HgModel.repo_scan('/', root_location, None, True)

63

64

class WhooshIndexingDaemon(object):

64

class WhooshIndexingDaemon(object):

65

"""Deamon for atomic jobs"""

65

"""Deamon for atomic jobs"""

66

67

def __init__(self, indexname='HG_INDEX', repo_location=None):

67

def __init__(self, indexname='HG_INDEX', repo_location=None):

68

self.indexname = indexname

68

self.indexname = indexname

69

self.repo_location = repo_location

69

self.repo_location = repo_location

70

self.initial = False

70

self.initial = False

71

if not os.path.isdir(IDX_LOCATION):

71

if not os.path.isdir(IDX_LOCATION):

72

os.mkdir(IDX_LOCATION)

72

os.mkdir(IDX_LOCATION)

73

log.info('Cannot run incremental index since it does not'

73

log.info('Cannot run incremental index since it does not'

74

' yet exist running full build')

74

' yet exist running full build')

75

self.initial = True

75

self.initial = True

76

77

def get_paths(self, root_dir):

77

def get_paths(self, root_dir):

78

"""recursive walk in root dir and return a set of all path in that dir

78

"""recursive walk in root dir and return a set of all path in that dir

79

excluding files in .hg dir"""

79

excluding files in .hg dir"""

80

index_paths_ = set()

80

index_paths_ = set()

81

for path, dirs, files in os.walk(root_dir):

81

for path, dirs, files in os.walk(root_dir):

82

if path.find('.hg') == -1:

82

if path.find('.hg') == -1:

83

for f in files:

83

for f in files:

84

index_paths_.add(jn(path, f))

84

index_paths_.add(jn(path, f))

85

86

return index_paths_

86

return index_paths_

87

88

def add_doc(self, writer, path, repo):

88

def add_doc(self, writer, path, repo):

89

"""Adding doc to writer"""

89

"""Adding doc to writer"""

90

91

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

91

ext = unicode(path.split('/')[-1].split('.')[-1].lower())

92

#we just index the content of choosen files

92

#we just index the content of choosen files

93

if ext in INDEX_EXTENSIONS:

93

if ext in INDEX_EXTENSIONS:

94

log.debug(' >> %s [WITH CONTENT]' % path)

94

log.debug(' >> %s [WITH CONTENT]' % path)

95

fobj = open(path, 'rb')

95

fobj = open(path, 'rb')

96

content = fobj.read()

96

content = fobj.read()

97

fobj.close()

97

fobj.close()

98

u_content = safe_unicode(content)

98

u_content = safe_unicode(content)

99

else:

99

else:

100

log.debug(' >> %s' % path)

100

log.debug(' >> %s' % path)

101

#just index file name without it's content

101

#just index file name without it's content

102

u_content = u''

102

u_content = u''

103

104

105

106

try:

106

try:

107

os.stat(path)

107

os.stat(path)

108

writer.add_document(owner=unicode(repo.contact),

108

writer.add_document(owner=unicode(repo.contact),

109

repository=u"%s" % repo.name,

109

repository=safe_unicode(repo.name),

110

path=u"%s" % path,

110

path=safe_unicode(path),

111

content=u_content,

111

content=u_content,

112

modtime=os.path.getmtime(path),

112

modtime=os.path.getmtime(path),

113

extension=ext)

113

extension=ext)

114

except OSError, e:

114

except OSError, e:

115

import errno

115

import errno

116

if e.errno == errno.ENOENT:

116

if e.errno == errno.ENOENT:

117

log.debug('path %s does not exist or is a broken symlink' % path)

117

log.debug('path %s does not exist or is a broken symlink' % path)

118

else:

118

else:

119

raise e

119

raise e

120

121

122

def build_index(self):

122

def build_index(self):

123

if os.path.exists(IDX_LOCATION):

123

if os.path.exists(IDX_LOCATION):

124

log.debug('removing previos index')

124

log.debug('removing previos index')

125

rmtree(IDX_LOCATION)

125

rmtree(IDX_LOCATION)

126

127

if not os.path.exists(IDX_LOCATION):

127

if not os.path.exists(IDX_LOCATION):

128

os.mkdir(IDX_LOCATION)

128

os.mkdir(IDX_LOCATION)

129

130

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

130

idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)

131

writer = idx.writer()

131

writer = idx.writer()

132

133

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

133

for cnt, repo in enumerate(scan_paths(self.repo_location).values()):

134

log.debug('building index @ %s' % repo.path)

134

log.debug('building index @ %s' % repo.path)

135

136

for idx_path in self.get_paths(repo.path):

136

for idx_path in self.get_paths(repo.path):

137

self.add_doc(writer, idx_path, repo)

137

self.add_doc(writer, idx_path, repo)

138

writer.commit(merge=True)

138

writer.commit(merge=True)

139

140

log.debug('>>> FINISHED BUILDING INDEX <<<')

140

log.debug('>>> FINISHED BUILDING INDEX <<<')

141

142

143

def update_index(self):

143

def update_index(self):

144

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

144

log.debug('STARTING INCREMENTAL INDEXING UPDATE')

145

146

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

146

idx = open_dir(IDX_LOCATION, indexname=self.indexname)

147

# The set of all paths in the index

147

# The set of all paths in the index

148

indexed_paths = set()

148

indexed_paths = set()

149

# The set of all paths we need to re-index

149

# The set of all paths we need to re-index

150

to_index = set()

150

to_index = set()

151

152

reader = idx.reader()

152

reader = idx.reader()

153

writer = idx.writer()

153

writer = idx.writer()

154

155

# Loop over the stored fields in the index

155

# Loop over the stored fields in the index

156

for fields in reader.all_stored_fields():

156

for fields in reader.all_stored_fields():

157

indexed_path = fields['path']

157

indexed_path = fields['path']

158

indexed_paths.add(indexed_path)

158

indexed_paths.add(indexed_path)

159

160

if not os.path.exists(indexed_path):

160

if not os.path.exists(indexed_path):

161

# This file was deleted since it was indexed

161

# This file was deleted since it was indexed

162

log.debug('removing from index %s' % indexed_path)

162

log.debug('removing from index %s' % indexed_path)

163

writer.delete_by_term('path', indexed_path)

163

writer.delete_by_term('path', indexed_path)

164

165

else:

165

else:

166

# Check if this file was changed since it

166

# Check if this file was changed since it

167

# was indexed

167

# was indexed

168

indexed_time = fields['modtime']

168

indexed_time = fields['modtime']

169

170

mtime = os.path.getmtime(indexed_path)

170

mtime = os.path.getmtime(indexed_path)

171

172

if mtime > indexed_time:

172

if mtime > indexed_time:

173

174

# The file has changed, delete it and add it to the list of

174

# The file has changed, delete it and add it to the list of

175

# files to reindex

175

# files to reindex

176

log.debug('adding to reindex list %s' % indexed_path)

176

log.debug('adding to reindex list %s' % indexed_path)

177

writer.delete_by_term('path', indexed_path)

177

writer.delete_by_term('path', indexed_path)

178

to_index.add(indexed_path)

178

to_index.add(indexed_path)

179

#writer.commit()

179

#writer.commit()

180

181

# Loop over the files in the filesystem

181

# Loop over the files in the filesystem

182

# Assume we have a function that gathers the filenames of the

182

# Assume we have a function that gathers the filenames of the

183

# documents to be indexed

183

# documents to be indexed

184

for repo in scan_paths(self.repo_location).values():

184

for repo in scan_paths(self.repo_location).values():

185

for path in self.get_paths(repo.path):

185

for path in self.get_paths(repo.path):

186

if path in to_index or path not in indexed_paths:

186

if path in to_index or path not in indexed_paths:

187

# This is either a file that's changed, or a new file

187

# This is either a file that's changed, or a new file

188

# that wasn't indexed before. So index it!

188

# that wasn't indexed before. So index it!

189

self.add_doc(writer, path, repo)

189

self.add_doc(writer, path, repo)

190

log.debug('reindexing %s' % path)

190

log.debug('reindexing %s' % path)

191

192

writer.commit(merge=True)

192

writer.commit(merge=True)

193

#idx.optimize()

193

#idx.optimize()

194

log.debug('>>> FINISHED <<<')

194

log.debug('>>> FINISHED <<<')

195

196

def run(self, full_index=False):

196

def run(self, full_index=False):

197

"""Run daemon"""

197

"""Run daemon"""

198

if full_index or self.initial:

198

if full_index or self.initial:

199

self.build_index()

199

self.build_index()

200

else:

200

else:

201

self.update_index()

201

self.update_index()

202

203

if __name__ == "__main__":

203

if __name__ == "__main__":

204

arg = sys.argv[1:]

204

arg = sys.argv[1:]

205

if len(arg) != 2:

205

if len(arg) != 2:

206

sys.stderr.write('Please specify indexing type [full|incremental]'

206

sys.stderr.write('Please specify indexing type [full|incremental]'

207

'and path to repositories as script args \n')

207

'and path to repositories as script args \n')

208

sys.exit()

208

sys.exit()

209

210

211

if arg[0] == 'full':

211

if arg[0] == 'full':

212

full_index = True

212

full_index = True

213

elif arg[0] == 'incremental':

213

elif arg[0] == 'incremental':

214

# False means looking just for changes

214

# False means looking just for changes

215

full_index = False

215

full_index = False

216

else:

216

else:

217

sys.stdout.write('Please use [full|incremental]'

217

sys.stdout.write('Please use [full|incremental]'

218

' as script first arg \n')

218

' as script first arg \n')

219

sys.exit()

219

sys.exit()

220

221

if not os.path.isdir(arg[1]):

221

if not os.path.isdir(arg[1]):

222

sys.stderr.write('%s is not a valid path \n' % arg[1])

222

sys.stderr.write('%s is not a valid path \n' % arg[1])

223

sys.exit()

223

sys.exit()

224

else:

224

else:

225

if arg[1].endswith('/'):

225

if arg[1].endswith('/'):

226

repo_location = arg[1] + '*'

226

repo_location = arg[1] + '*'

227

else:

227

else:

228

repo_location = arg[1] + '/*'

228

repo_location = arg[1] + '/*'

229

230

try:

230

try:

231

l = DaemonLock()

231

l = DaemonLock()

232

WhooshIndexingDaemon(repo_location=repo_location)\

232

WhooshIndexingDaemon(repo_location=repo_location)\

233

.run(full_index=full_index)

233

.run(full_index=full_index)

234

l.release()

234

l.release()

235

reload(logging)

235

reload(logging)

236

except LockHeld:

236

except LockHeld:

237

sys.exit(1)

237

sys.exit(1)

238

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             #!/usr/bin/env python
             # encoding: utf-8
             # whoosh indexer daemon for rhodecode
             # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
             #
             # This program is free software; you can redistribute it and/or
             # modify it under the terms of the GNU General Public License
             # as published by the Free Software Foundation; version 2
             # of the License or (at your option) any later version of the license.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software
             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
             # MA  02110-1301, USA.
             """
             Created on Jan 26, 2010
             @author: marcink
             A daemon that builds, or incrementally updates, the whoosh index of repository files
             """
             import sys
             import os
             from os.path import dirname as dn
             from os.path import join as jn
             #to get the rhodecode import
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from rhodecode.lib.pidlock import LockHeld, DaemonLock
             from rhodecode.model.hg_model import HgModel
             from rhodecode.lib.helpers import safe_unicode
             from whoosh.index import create_in, open_dir
             from shutil import rmtree
             from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
             import logging
             # Layout of every emitted line: timestamp - logger name - level - message
             formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

             # Console (stream) handler that lets records of every level through
             ch = logging.StreamHandler()
             ch.setFormatter(formatter)
             ch.setLevel(logging.DEBUG)

             # Dedicated logger of the indexer; records are not passed on to ancestor
             # loggers, so the handler above is the only place they are written to
             log = logging.getLogger('whooshIndexer')
             log.propagate = False
             log.setLevel(logging.DEBUG)
             log.addHandler(ch)
             def scan_paths(root_location):
                 """Scan ``root_location`` for repositories.

                 Thin wrapper that forwards to ``HgModel.repo_scan`` with fixed
                 remaining arguments and hands its result back unchanged. The callers
                 in this module iterate over ``.values()`` of that result.
                 """
                 repositories = HgModel.repo_scan('/', root_location, None, True)
                 return repositories
             class WhooshIndexingDaemon(object):
                 """Build and maintain the whoosh index of repository files.

                 ``run()`` either rebuilds the index stored in ``IDX_LOCATION`` from
                 scratch (``build_index``) or refreshes it incrementally
                 (``update_index``).
                 """

                 def __init__(self, indexname='HG_INDEX', repo_location=None):
                     """Store the settings and detect a missing index directory.

                     :param indexname: name used by ``update_index`` to open the index
                     :param repo_location: location handed to ``scan_paths`` to find
                         the repositories that should be indexed

                     When ``IDX_LOCATION`` is not a directory yet it is created and
                     ``self.initial`` is set, which makes ``run()`` do a full build.
                     """
                     # NOTE(review): build_index creates the index under IDX_NAME while
                     # update_index opens ``self.indexname``; incremental runs only work
                     # when both name the same index - confirm IDX_NAME's value.
                     self.indexname = indexname
                     self.repo_location = repo_location
                     self.initial = False
                     if not os.path.isdir(IDX_LOCATION):
                         os.mkdir(IDX_LOCATION)
                         log.info('Cannot run incremental index since it does not'
                                  ' yet exist running full build')
                         self.initial = True

                 def get_paths(self, root_dir):
                     """Return the set of paths of all files below ``root_dir``.

                     Directories named exactly ``.hg`` are skipped together with
                     everything inside them. Other names that merely contain ``.hg``
                     (for example ``my.hgstuff``) are walked like any other directory.

                     :param root_dir: directory to walk recursively
                     :returns: set of file paths, each joined onto its walked directory
                     """
                     index_paths_ = set()
                     for path, dirs, files in os.walk(root_dir):
                         # os.walk is top-down by default, so removing the entry from
                         # ``dirs`` in place stops it from descending into ``.hg``
                         if '.hg' in dirs:
                             dirs.remove('.hg')
                         for f in files:
                             index_paths_.add(jn(path, f))

                     return index_paths_

                 def add_doc(self, writer, path, repo):
                     """Add the file at ``path`` to the index through ``writer``.

                     The file content is indexed only when the file extension is in
                     ``INDEX_EXTENSIONS``; every other file is added with empty content.
                     A path that does not exist (any more), or is a broken symlink, is
                     logged and skipped.

                     :param writer: index writer; ``add_document`` is called on it
                     :param path: file system path of the file to add
                     :param repo: repository object; ``contact`` and ``name`` are read
                     :raises EnvironmentError: (IOError/OSError) when reading the file
                         fails for a reason other than ``ENOENT``
                     """
                     import errno

                     # text after the last dot of the file name, lower-cased; a name
                     # without any dot yields the whole file name
                     ext = unicode(path.split('/')[-1].split('.')[-1].lower())
                     with_content = ext in INDEX_EXTENSIONS

                     if with_content:
                         log.debug('    >> %s [WITH CONTENT]' % path)
                     else:
                         log.debug('    >> %s' % path)

                     try:
                         # fails with ENOENT for a vanished file or a dangling symlink
                         modtime = os.path.getmtime(path)
                         if with_content:
                             #we just index the content of chosen files
                             fobj = open(path, 'rb')
                             try:
                                 content = fobj.read()
                             finally:
                                 fobj.close()
                             u_content = safe_unicode(content)
                         else:
                             #just index file name without its content
                             u_content = u''
                     except EnvironmentError:
                         # sys.exc_info() instead of "except ..., e" / "except ... as e"
                         # so that the statement parses on every python version
                         e = sys.exc_info()[1]
                         if e.errno != errno.ENOENT:
                             # bare raise keeps the original traceback
                             raise
                         log.debug('path %s does not exist or is a broken symlink' % path)
                         return

                     writer.add_document(owner=unicode(repo.contact),
                                         repository=safe_unicode(repo.name),
                                         path=safe_unicode(path),
                                         content=u_content,
                                         modtime=modtime,
                                         extension=ext)

                 def build_index(self):
                     """Throw away any existing index and build a new one from scratch.

                     ``IDX_LOCATION`` is removed and recreated, then every file of every
                     repository returned by ``scan_paths(self.repo_location)`` is added
                     and the writer is committed once at the end.
                     """
                     if os.path.exists(IDX_LOCATION):
                         log.debug('removing previos index')
                         rmtree(IDX_LOCATION)

                     if not os.path.exists(IDX_LOCATION):
                         os.mkdir(IDX_LOCATION)

                     idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
                     writer = idx.writer()

                     for repo in scan_paths(self.repo_location).values():
                         log.debug('building index @ %s' % repo.path)

                         for idx_path in self.get_paths(repo.path):
                             self.add_doc(writer, idx_path, repo)
                     writer.commit(merge=True)

                     log.debug('>>> FINISHED BUILDING INDEX <<<')

                 def update_index(self):
                     """Bring the existing index in line with the files on disk.

                     First pass (over the stored documents): documents whose file is
                     gone are deleted; documents whose file has a newer modification
                     time than the stored ``modtime`` are deleted and queued for
                     re-indexing. Second pass (over the files on disk): queued files
                     and files that were never indexed are added. Everything is
                     committed once at the end.
                     """
                     log.debug('STARTING INCREMENTAL INDEXING UPDATE')

                     idx = open_dir(IDX_LOCATION, indexname=self.indexname)
                     # The set of all paths in the index
                     indexed_paths = set()
                     # The set of all paths we need to re-index
                     to_index = set()

                     reader = idx.reader()
                     writer = idx.writer()

                     try:
                         # Loop over the stored fields in the index
                         for fields in reader.all_stored_fields():
                             indexed_path = fields['path']
                             indexed_paths.add(indexed_path)

                             if not os.path.exists(indexed_path):
                                 # This file was deleted since it was indexed
                                 log.debug('removing from index %s' % indexed_path)
                                 writer.delete_by_term('path', indexed_path)

                             else:
                                 # Check if this file was changed since it
                                 # was indexed
                                 indexed_time = fields['modtime']

                                 mtime = os.path.getmtime(indexed_path)

                                 if mtime > indexed_time:

                                     # The file has changed, delete it and add it to the
                                     # list of files to reindex
                                     log.debug('adding to reindex list %s' % indexed_path)
                                     writer.delete_by_term('path', indexed_path)
                                     to_index.add(indexed_path)
                     finally:
                         # the reader is only needed for the scan above
                         reader.close()

                     # Loop over the files in the filesystem
                     for repo in scan_paths(self.repo_location).values():
                         for path in self.get_paths(repo.path):
                             # add_doc writes the path as safe_unicode(path), so look it
                             # up in that same form; presumably the stored field comes
                             # back unchanged - verify against SCHEMA
                             u_path = safe_unicode(path)
                             if u_path in to_index or u_path not in indexed_paths:
                                 # This is either a file that's changed, or a new file
                                 # that wasn't indexed before. So index it!
                                 self.add_doc(writer, path, repo)
                                 log.debug('reindexing %s' % path)

                     writer.commit(merge=True)
                     log.debug('>>> FINISHED <<<')

                 def run(self, full_index=False):
                     """Run the daemon once.

                     :param full_index: force a full rebuild; a full rebuild is also
                         done when the index directory had to be created in ``__init__``
                     """
                     if full_index or self.initial:
                         self.build_index()
                     else:
                         self.update_index()
             if __name__ == "__main__":
                 # Command line: <script> full|incremental <path to repositories>
                 # Exit status: 2 for wrong arguments, 1 when the lock is already held.
                 arg = sys.argv[1:]
                 if len(arg) != 2:
                     sys.stderr.write('Please specify indexing type [full|incremental] '
                                      'and path to repositories as script args \n')
                     sys.exit(2)

                 if arg[0] == 'full':
                     full_index = True
                 elif arg[0] == 'incremental':
                     # False means looking just for changes
                     full_index = False
                 else:
                     sys.stderr.write('Please use [full|incremental]'
                                      ' as script first arg \n')
                     sys.exit(2)

                 if not os.path.isdir(arg[1]):
                     sys.stderr.write('%s is not a valid path \n' % arg[1])
                     sys.exit(2)

                 # "<dir>/*" without doubling a trailing slash; this is the location
                 # that the daemon hands to scan_paths
                 repo_location = jn(arg[1], '*')

                 try:
                     l = DaemonLock()
                     try:
                         WhooshIndexingDaemon(repo_location=repo_location)\
                             .run(full_index=full_index)
                     finally:
                         # give the lock back even when indexing fails
                         l.release()
                     reload(logging)
                 except LockHeld:
                     sys.exit(1)