##// END OF EJS Templates
fixed path issue
marcink -
r407:0c9dfae5 default
parent child Browse files
Show More
@@ -1,181 +1,181
1 #!/usr/bin/env python
1 #!/usr/bin/env python
2 # encoding: utf-8
2 # encoding: utf-8
3 # whoosh indexer daemon for hg-app
3 # whoosh indexer daemon for hg-app
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
4 # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
5 #
5 #
6 # This program is free software; you can redistribute it and/or
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; version 2
8 # as published by the Free Software Foundation; version 2
9 # of the License or (at your option) any later version of the license.
9 # of the License or (at your option) any later version of the license.
10 #
10 #
11 # This program is distributed in the hope that it will be useful,
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
14 # GNU General Public License for more details.
15 #
15 #
16 # You should have received a copy of the GNU General Public License
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
19 # MA 02110-1301, USA.
20 """
20 """
21 Created on Jan 26, 2010
21 Created on Jan 26, 2010
22
22
23 @author: marcink
23 @author: marcink
24 A daemon will read from the task table and run tasks
24 A daemon will read from the task table and run tasks
25 """
25 """
26 import sys
26 import sys
27 import os
27 import os
28 from pidlock import LockHeld, DaemonLock
28 from pidlock import LockHeld, DaemonLock
29 import traceback
29 import traceback
30
30
31 from os.path import dirname as dn
31 from os.path import dirname as dn
32 from os.path import join as jn
32 from os.path import join as jn
33
33
34 #to get the pylons_app import
34 #to get the pylons_app import
35 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
35 sys.path.append(dn(dn(dn(dn(os.path.realpath(__file__))))))
36
36
37 from pylons_app.config.environment import load_environment
37 from pylons_app.config.environment import load_environment
38 from pylons_app.model.hg_model import HgModel
38 from pylons_app.model.hg_model import HgModel
39 from whoosh.index import create_in, open_dir
39 from whoosh.index import create_in, open_dir
40 from shutil import rmtree
40 from shutil import rmtree
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
42 import logging
42 import logging
43 log = logging.getLogger(__name__)
43 log = logging.getLogger(__name__)
44
44
45
45
46 location = '/home/marcink/python_workspace_dirty/*'
46 location = '/home/marcink/python_workspace_dirty/*'
47
47
48 def scan_paths(root_location):
48 def scan_paths(root_location):
49 return HgModel.repo_scan('/', root_location, None, True)
49 return HgModel.repo_scan('/', root_location, None, True)
50
50
51 class WhooshIndexingDaemon(object):
51 class WhooshIndexingDaemon(object):
52 """Deamon for atomic jobs"""
52 """Deamon for atomic jobs"""
53
53
54 def __init__(self, indexname='HG_INDEX'):
54 def __init__(self, indexname='HG_INDEX'):
55 self.indexname = indexname
55 self.indexname = indexname
56
56
57
57
58 def get_paths(self, root_dir):
58 def get_paths(self, root_dir):
59 """recursive walk in root dir and return a set of all path in that dir
59 """recursive walk in root dir and return a set of all path in that dir
60 excluding files in .hg dir"""
60 excluding files in .hg dir"""
61 index_paths_ = set()
61 index_paths_ = set()
62 for path, dirs, files in os.walk(root_dir):
62 for path, dirs, files in os.walk(root_dir):
63 if path.find('.hg') == -1:
63 if path.find('.hg') == -1:
64 for f in files:
64 for f in files:
65 index_paths_.add(jn(path, f))
65 index_paths_.add(jn(path, f))
66
66
67 return index_paths_
67 return index_paths_
68
68
69 def add_doc(self, writer, path, repo):
69 def add_doc(self, writer, path, repo):
70 """Adding doc to writer"""
70 """Adding doc to writer"""
71
71
72 #we don't won't to read excluded file extensions just index them
72 #we don't won't to read excluded file extensions just index them
73 if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
73 if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS:
74 fobj = open(path, 'rb')
74 fobj = open(path, 'rb')
75 content = fobj.read()
75 content = fobj.read()
76 fobj.close()
76 fobj.close()
77 try:
77 try:
78 u_content = unicode(content)
78 u_content = unicode(content)
79 except UnicodeDecodeError:
79 except UnicodeDecodeError:
80 #incase we have a decode error just represent as byte string
80 #incase we have a decode error just represent as byte string
81 u_content = unicode(str(content).encode('string_escape'))
81 u_content = unicode(str(content).encode('string_escape'))
82 else:
82 else:
83 u_content = u''
83 u_content = u''
84 writer.add_document(owner=unicode(repo.contact),
84 writer.add_document(owner=unicode(repo.contact),
85 repository=u"%s" % repo.name,
85 repository=u"%s" % repo.name,
86 path=u"%s" % path,
86 path=u"%s" % path,
87 content=u_content,
87 content=u_content,
88 modtime=os.path.getmtime(path))
88 modtime=os.path.getmtime(path))
89
89
90 def build_index(self):
90 def build_index(self):
91 if os.path.exists(IDX_LOCATION):
91 if os.path.exists(IDX_LOCATION):
92 rmtree(IDX_LOCATION)
92 rmtree(IDX_LOCATION)
93
93
94 if not os.path.exists(IDX_LOCATION):
94 if not os.path.exists(IDX_LOCATION):
95 os.mkdir(IDX_LOCATION)
95 os.mkdir(IDX_LOCATION)
96
96
97 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
97 idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
98 writer = idx.writer()
98 writer = idx.writer()
99
99
100 for cnt, repo in enumerate(scan_paths(location).values()):
100 for cnt, repo in enumerate(scan_paths(location).values()):
101 log.debug('building index @ %s' % repo.path)
101 log.debug('building index @ %s' % repo.path)
102
102
103 for idx_path in self.get_paths(repo.path):
103 for idx_path in self.get_paths(repo.path):
104 log.debug(' >> %s' % idx_path)
104 log.debug(' >> %s' % idx_path)
105 self.add_doc(writer, idx_path, repo)
105 self.add_doc(writer, idx_path, repo)
106 writer.commit(merge=True)
106 writer.commit(merge=True)
107
107
108 log.debug('>>> FINISHED BUILDING INDEX <<<')
108 log.debug('>>> FINISHED BUILDING INDEX <<<')
109
109
110
110
111 def update_index(self):
111 def update_index(self):
112 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
112 log.debug('STARTING INCREMENTAL INDEXING UPDATE')
113
113
114 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
114 idx = open_dir(IDX_LOCATION, indexname=self.indexname)
115 # The set of all paths in the index
115 # The set of all paths in the index
116 indexed_paths = set()
116 indexed_paths = set()
117 # The set of all paths we need to re-index
117 # The set of all paths we need to re-index
118 to_index = set()
118 to_index = set()
119
119
120 reader = idx.reader()
120 reader = idx.reader()
121 writer = idx.writer()
121 writer = idx.writer()
122
122
123 # Loop over the stored fields in the index
123 # Loop over the stored fields in the index
124 for fields in reader.all_stored_fields():
124 for fields in reader.all_stored_fields():
125 indexed_path = fields['path']
125 indexed_path = fields['path']
126 indexed_paths.add(indexed_path)
126 indexed_paths.add(indexed_path)
127
127
128 if not os.path.exists(indexed_path):
128 if not os.path.exists(indexed_path):
129 # This file was deleted since it was indexed
129 # This file was deleted since it was indexed
130 log.debug('removing from index %s' % indexed_path)
130 log.debug('removing from index %s' % indexed_path)
131 writer.delete_by_term('path', indexed_path)
131 writer.delete_by_term('path', indexed_path)
132
132
133 else:
133 else:
134 # Check if this file was changed since it
134 # Check if this file was changed since it
135 # was indexed
135 # was indexed
136 indexed_time = fields['modtime']
136 indexed_time = fields['modtime']
137
137
138 mtime = os.path.getmtime(indexed_path)
138 mtime = os.path.getmtime(indexed_path)
139
139
140 if mtime > indexed_time:
140 if mtime > indexed_time:
141
141
142 # The file has changed, delete it and add it to the list of
142 # The file has changed, delete it and add it to the list of
143 # files to reindex
143 # files to reindex
144 log.debug('adding to reindex list %s' % indexed_path)
144 log.debug('adding to reindex list %s' % indexed_path)
145 writer.delete_by_term('path', indexed_path)
145 writer.delete_by_term('path', indexed_path)
146 to_index.add(indexed_path)
146 to_index.add(indexed_path)
147 #writer.commit()
147 #writer.commit()
148
148
149 # Loop over the files in the filesystem
149 # Loop over the files in the filesystem
150 # Assume we have a function that gathers the filenames of the
150 # Assume we have a function that gathers the filenames of the
151 # documents to be indexed
151 # documents to be indexed
152 for repo in scan_paths(location).values():
152 for repo in scan_paths(location).values():
153 for path in self.get_paths(repo.path):
153 for path in self.get_paths(repo.path):
154 if path in to_index or path not in indexed_paths:
154 if path in to_index or path not in indexed_paths:
155 # This is either a file that's changed, or a new file
155 # This is either a file that's changed, or a new file
156 # that wasn't indexed before. So index it!
156 # that wasn't indexed before. So index it!
157 self.add_doc(writer, path, repo)
157 self.add_doc(writer, path, repo)
158 log.debug('reindexing %s' % path)
158 log.debug('reindexing %s' % path)
159
159
160 writer.commit(merge=True)
160 writer.commit(merge=True)
161 #idx.optimize()
161 #idx.optimize()
162 log.debug('>>> FINISHED <<<')
162 log.debug('>>> FINISHED <<<')
163
163
164 def run(self, full_index=False):
164 def run(self, full_index=False):
165 """Run daemon"""
165 """Run daemon"""
166 if full_index:
166 if full_index:
167 self.build_index()
167 self.build_index()
168 else:
168 else:
169 self.update_index()
169 self.update_index()
170
170
171 if __name__ == "__main__":
171 if __name__ == "__main__":
172
172
173 #config = load_environment()
173 #config = load_environment()
174 #print config
174 #print config
175 try:
175 try:
176 l = DaemonLock()
176 l = DaemonLock()
177 WhooshIndexingDaemon().run(full_index=True)
177 WhooshIndexingDaemon().run(full_index=True)
178 l.release()
178 l.release()
179 except LockHeld:
179 except LockHeld:
180 sys.exit(1)
180 sys.exit(1)
181
181
General Comments 0
You need to be logged in to leave comments. Login now