upstream/kallithea Commit - r1198:02a7f263

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.__init__

3

rhodecode.lib.indexers.__init__

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

Whoosh indexing module for RhodeCode

6

Whoosh indexing module for RhodeCode

7

8

:created_on: Aug 17, 2010

8

:created_on: Aug 17, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software; you can redistribute it and/or

13

# This program is free software; you can redistribute it and/or

14

# modify it under the terms of the GNU General Public License

14

# modify it under the terms of the GNU General Public License

15

# as published by the Free Software Foundation; version 2

15

# as published by the Free Software Foundation; version 2

16

# of the License or (at your option) any later version of the license.

16

# of the License or (at your option) any later version of the license.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program; if not, write to the Free Software

24

# along with this program; if not, write to the Free Software

25

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

25

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

26

# MA 02110-1301, USA.

26

# MA 02110-1301, USA.

27

import os

27

import os

28

import sys

28

import sys

29

import traceback

29

import traceback

30

from os.path import dirname as dn, join as jn

30

from os.path import dirname as dn, join as jn

31

32

#to get the rhodecode import

32

#to get the rhodecode import

33

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

33

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

34

35

from string import strip

35

from string import strip

36

37

from rhodecode.model import init_model

37

from rhodecode.model import init_model

38

from rhodecode.model.scm import ScmModel

38

from rhodecode.model.scm import ScmModel

39

from rhodecode.config.environment import load_environment

39

from rhodecode.config.environment import load_environment

40

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

40

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

41

42

from shutil import rmtree

42

from shutil import rmtree

43

from webhelpers.html.builder import escape

43

from webhelpers.html.builder import escape

44

from vcs.utils.lazy import LazyProperty

44

from vcs.utils.lazy import LazyProperty

45

46

from sqlalchemy import engine_from_config

46

from sqlalchemy import engine_from_config

47

48

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

48

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

49

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

49

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

50

from whoosh.index import create_in, open_dir

50

from whoosh.index import create_in, open_dir

51

from whoosh.formats import Characters

51

from whoosh.formats import Characters

52

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

52

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

53

54

55

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

55

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

56

INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',

56

INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',

57

'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',

57

'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',

58

'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',

58

'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',

59

'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',

59

'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',

60

'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',

60

'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',

61

'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',

61

'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',

62

'yaws']

62

'yaws']

63

64

#CUSTOM ANALYZER wordsplit + lowercase filter

64

#CUSTOM ANALYZER wordsplit + lowercase filter

65

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

65

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

66

67

68

#INDEX SCHEMA DEFINITION

68

#INDEX SCHEMA DEFINITION

69

SCHEMA = Schema(owner=TEXT(),

69

SCHEMA = Schema(owner=TEXT(),

70

repository=TEXT(stored=True),

70

repository=TEXT(stored=True),

71

path=TEXT(stored=True),

71

path=TEXT(stored=True),

72

content=FieldType(format=Characters(ANALYZER),

72

content=FieldType(format=Characters(ANALYZER),

73

scorable=True, stored=True),

73

scorable=True, stored=True),

74

modtime=STORED(), extension=TEXT(stored=True))

74

modtime=STORED(), extension=TEXT(stored=True))

75

76

77

IDX_NAME = 'HG_INDEX'

77

IDX_NAME = 'HG_INDEX'

78

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

78

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

79

FRAGMENTER = SimpleFragmenter(200)

79

FRAGMENTER = SimpleFragmenter(200)

80

81

82

class MakeIndex(BasePasterCommand):

82

class MakeIndex(BasePasterCommand):

83

84

max_args = 1

84

max_args = 1

85

min_args = 1

85

min_args = 1

86

87

usage = "CONFIG_FILE"

87

usage = "CONFIG_FILE"

88

summary = "Creates index for full text search given configuration file"

88

summary = "Creates index for full text search given configuration file"

89

group_name = "RhodeCode"

89

group_name = "RhodeCode"

90

takes_config_file = -1

90

takes_config_file = -1

91

parser = Command.standard_parser(verbose=True)

91

parser = Command.standard_parser(verbose=True)

92

93

def command(self):

93

def command(self):

94

95

from pylons import config

95

from pylons import config

96

add_cache(config)

96

add_cache(config)

97

engine = engine_from_config(config, 'sqlalchemy.db1.')

97

engine = engine_from_config(config, 'sqlalchemy.db1.')

98

init_model(engine)

98

init_model(engine)

99

100

index_location = config['index_dir']

100

index_location = config['index_dir']

101

repo_location = self.options.repo_location

101

repo_location = self.options.repo_location

102

repo_list = map(strip, self.options.repo_list.split(',')) \

102

repo_list = map(strip, self.options.repo_list.split(',')) \

103

if self.options.repo_list else None

103

if self.options.repo_list else None

104

105

#======================================================================

105

#======================================================================

106

# WHOOSH DAEMON

106

# WHOOSH DAEMON

107

#======================================================================

107

#======================================================================

108

from rhodecode.lib.pidlock import LockHeld, DaemonLock

108

from rhodecode.lib.pidlock import LockHeld, DaemonLock

109

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

109

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

110

try:

110

try:

111

l = DaemonLock()

111

l = DaemonLock()

112

WhooshIndexingDaemon(index_location=index_location,

112

WhooshIndexingDaemon(index_location=index_location,

113

repo_location=repo_location,

113

repo_location=repo_location,

114

repo_list=repo_list)\

114

repo_list=repo_list)\

115

.run(full_index=self.options.full_index)

115

.run(full_index=self.options.full_index)

116

l.release()

116

l.release()

117

except LockHeld:

117

except LockHeld:

118

sys.exit(1)

118

sys.exit(1)

119

120

def update_parser(self):

120

def update_parser(self):

121

self.parser.add_option('--repo-location',

121

self.parser.add_option('--repo-location',

122

action='store',

122

action='store',

123

dest='repo_location',

123

dest='repo_location',

124

help="Specifies repositories location to index REQUIRED",

124

help="Specifies repositories location to index REQUIRED",

125

)

125

)

126

self.parser.add_option('--index-only',

126

self.parser.add_option('--index-only',

127

action='store',

127

action='store',

128

dest='repo_list',

128

dest='repo_list',

129

help="Specifies a comma separated list of repositores "

129

help="Specifies a comma separated list of repositores "

130

"to build index on OPTIONAL",

130

"to build index on OPTIONAL",

131

)

131

)

132

self.parser.add_option('-f',

132

self.parser.add_option('-f',

133

action='store_true',

133

action='store_true',

134

dest='full_index',

134

dest='full_index',

135

help="Specifies that index should be made full i.e"

135

help="Specifies that index should be made full i.e"

136

" destroy old and build from scratch",

136

" destroy old and build from scratch",

137

default=False)

137

default=False)

138

139

class ResultWrapper(object):

139

class ResultWrapper(object):

140

def __init__(self, search_type, searcher, matcher, highlight_items):

140

def __init__(self, search_type, searcher, matcher, highlight_items):

141

self.search_type = search_type

141

self.search_type = search_type

142

self.searcher = searcher

142

self.searcher = searcher

143

self.matcher = matcher

143

self.matcher = matcher

144

self.highlight_items = highlight_items

144

self.highlight_items = highlight_items

145

self.fragment_size = 200 / 2

145

self.fragment_size = 200 / 2

146

147

@LazyProperty

147

@LazyProperty

148

def doc_ids(self):

148

def doc_ids(self):

149

docs_id = []

149

docs_id = []

150

while self.matcher.is_active():

150

while self.matcher.is_active():

151

docnum = self.matcher.id()

151

docnum = self.matcher.id()

152

chunks = [offsets for offsets in self.get_chunks()]

152

chunks = [offsets for offsets in self.get_chunks()]

153

docs_id.append([docnum, chunks])

153

docs_id.append([docnum, chunks])

154

self.matcher.next()

154

self.matcher.next()

155

return docs_id

155

return docs_id

156

157

def __str__(self):

157

def __str__(self):

158

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

158

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

159

160

def __repr__(self):

160

def __repr__(self):

161

return self.__str__()

161

return self.__str__()

162

163

def __len__(self):

163

def __len__(self):

164

return len(self.doc_ids)

164

return len(self.doc_ids)

165

166

def __iter__(self):

166

def __iter__(self):

167

"""

167

"""

168

Allows Iteration over results,and lazy generate content

168

Allows Iteration over results,and lazy generate content

169

170

*Requires* implementation of ``__getitem__`` method.

170

*Requires* implementation of ``__getitem__`` method.

171

"""

171

"""

172

for docid in self.doc_ids:

172

for docid in self.doc_ids:

173

yield self.get_full_content(docid)

173

yield self.get_full_content(docid)

174

175

def __get~~slice~~__(self, i, j):

175

def __getitem__(self, key):

176

"""

176

"""

177

Slicing of resultWrapper

177

Slicing of resultWrapper

178

"""

178

"""

179

i, j = key.start, key.stop

180

179

slice = []

181

slice = []

180

for docid in self.doc_ids[i:j]:

182

for docid in self.doc_ids[i:j]:

181

slice.append(self.get_full_content(docid))

183

slice.append(self.get_full_content(docid))

182

return slice

184

return slice

183

185

184

186

185

def get_full_content(self, docid):

187

def get_full_content(self, docid):

186

res = self.searcher.stored_fields(docid[0])

188

res = self.searcher.stored_fields(docid[0])

187

f_path = res['path'][res['path'].find(res['repository']) \

189

f_path = res['path'][res['path'].find(res['repository']) \

188

+ len(res['repository']):].lstrip('/')

190

+ len(res['repository']):].lstrip('/')

189

191

190

content_short = self.get_short_content(res, docid[1])

192

content_short = self.get_short_content(res, docid[1])

191

res.update({'content_short':content_short,

193

res.update({'content_short':content_short,

192

'content_short_hl':self.highlight(content_short),

194

'content_short_hl':self.highlight(content_short),

193

'f_path':f_path})

195

'f_path':f_path})

194

196

195

return res

197

return res

196

198

197

def get_short_content(self, res, chunks):

199

def get_short_content(self, res, chunks):

198

200

199

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

201

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

200

202

201

def get_chunks(self):

203

def get_chunks(self):

202

"""

204

"""

203

Smart function that implements chunking the content

205

Smart function that implements chunking the content

204

but not overlap chunks so it doesn't highlight the same

206

but not overlap chunks so it doesn't highlight the same

205

close occurrences twice.

207

close occurrences twice.

206

@param matcher:

208

@param matcher:

207

@param size:

209

@param size:

208

"""

210

"""

209

memory = [(0, 0)]

211

memory = [(0, 0)]

210

for span in self.matcher.spans():

212

for span in self.matcher.spans():

211

start = span.startchar or 0

213

start = span.startchar or 0

212

end = span.endchar or 0

214

end = span.endchar or 0

213

start_offseted = max(0, start - self.fragment_size)

215

start_offseted = max(0, start - self.fragment_size)

214

end_offseted = end + self.fragment_size

216

end_offseted = end + self.fragment_size

215

217

216

if start_offseted < memory[-1][1]:

218

if start_offseted < memory[-1][1]:

217

start_offseted = memory[-1][1]

219

start_offseted = memory[-1][1]

218

memory.append((start_offseted, end_offseted,))

220

memory.append((start_offseted, end_offseted,))

219

yield (start_offseted, end_offseted,)

221

yield (start_offseted, end_offseted,)

220

222

221

def highlight(self, content, top=5):

223

def highlight(self, content, top=5):

222

if self.search_type != 'content':

224

if self.search_type != 'content':

223

return ''

225

return ''

224

hl = highlight(escape(content),

226

hl = highlight(escape(content),

225

self.highlight_items,

227

self.highlight_items,

226

analyzer=ANALYZER,

228

analyzer=ANALYZER,

227

fragmenter=FRAGMENTER,

229

fragmenter=FRAGMENTER,

228

formatter=FORMATTER,

230

formatter=FORMATTER,

229

top=top)

231

top=top)

230

return hl

232

return hl

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.__init__
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 Whoosh indexing module for RhodeCode
                 :created_on: Aug 17, 2010
                 :author: marcink
                 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software; you can redistribute it and/or
             # modify it under the terms of the GNU General Public License
             # as published by the Free Software Foundation; version 2
             # of the License or (at your option) any later version of the license.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software
             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
             # MA  02110-1301, USA.
             import os
             import sys
             import traceback
             from os.path import dirname as dn, join as jn
             #to get the rhodecode import
             sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
             from string import strip
             from rhodecode.model import init_model
             from rhodecode.model.scm import ScmModel
             from rhodecode.config.environment import load_environment
             from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
             from shutil import rmtree
             from webhelpers.html.builder import escape
             from vcs.utils.lazy import LazyProperty
             from sqlalchemy import engine_from_config
             from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
             from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
             from whoosh.index import create_in, open_dir
             from whoosh.formats import Characters
             from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
             # Extensions of the files whose content we want to index.
             INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                                 'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                                 'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                                 'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                                 'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                                 'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                                 'yaws']
             # Custom analyzer: split the text into runs of word characters (\w+),
             # then pipe the tokens through a lowercase filter.
             ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
             # Index schema definition. `content` is the only field built on the
             # custom ANALYZER; it uses the Characters format
             # (NOTE(review): presumably so that matcher spans carry the
             # startchar/endchar offsets read in ResultWrapper.get_chunks - confirm).
             SCHEMA = Schema(owner=TEXT(),
                             repository=TEXT(stored=True),
                             path=TEXT(stored=True),
                             content=FieldType(format=Characters(ANALYZER),
                                          scorable=True, stored=True),
                             modtime=STORED(), extension=TEXT(stored=True))
             # Index name; not used in this module itself
             # (NOTE(review): presumably consumed by the indexing daemon - confirm).
             IDX_NAME = 'HG_INDEX'
             # Formatter and fragmenter handed to whoosh's highlight() in
             # ResultWrapper.highlight: hits are wrapped in <span> and fragments are
             # joined with the "break" marker below.
             FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
             # NOTE(review): 200 is presumably the fragment size in characters
             # (ResultWrapper uses half of it as context around a hit) - confirm.
             FRAGMENTER = SimpleFragmenter(200)
             class MakeIndex(BasePasterCommand):
                 """
                 Paster command that builds the Whoosh full text search index.

                 Takes exactly one positional argument (the configuration file) and
                 the options registered in ``update_parser``.
                 """

                 max_args = 1
                 min_args = 1

                 usage = "CONFIG_FILE"
                 summary = "Creates index for full text search given configuration file"
                 group_name = "RhodeCode"
                 takes_config_file = -1
                 parser = Command.standard_parser(verbose=True)

                 def command(self):
                     """
                     Initialize the models from the loaded config and run the
                     indexing daemon under a daemon lock.

                     Exits the process with status 1 when the lock is already held.
                     """
                     from pylons import config
                     add_cache(config)
                     engine = engine_from_config(config, 'sqlalchemy.db1.')
                     init_model(engine)

                     index_location = config['index_dir']
                     repo_location = self.options.repo_location
                     # --index-only is optional; None tells the daemon that no
                     # explicit repository list was given
                     if self.options.repo_list:
                         repo_list = [name.strip() for name
                                      in self.options.repo_list.split(',')]
                     else:
                         repo_list = None

                     #==============================================================
                     # WHOOSH DAEMON
                     #==============================================================
                     from rhodecode.lib.pidlock import LockHeld, DaemonLock
                     from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
                     try:
                         lock = DaemonLock()
                         try:
                             daemon = WhooshIndexingDaemon(
                                 index_location=index_location,
                                 repo_location=repo_location,
                                 repo_list=repo_list)
                             daemon.run(full_index=self.options.full_index)
                         finally:
                             # release even when indexing fails, otherwise the
                             # lock stays held after a crashed run
                             lock.release()
                     except LockHeld:
                         sys.exit(1)

                 def update_parser(self):
                     """Register the command line options of this command."""
                     self.parser.add_option(
                         '--repo-location',
                         action='store',
                         dest='repo_location',
                         help="Specifies repositories location to index REQUIRED",
                     )
                     self.parser.add_option(
                         '--index-only',
                         action='store',
                         dest='repo_list',
                         help="Specifies a comma separated list of repositores "
                              "to build index on OPTIONAL",
                     )
                     self.parser.add_option(
                         '-f',
                         action='store_true',
                         dest='full_index',
                         help="Specifies that index should be made full i.e"
                              " destroy old and build from scratch",
                         default=False)
             class ResultWrapper(object):
                 """
                 Wrapper around a Whoosh matcher that exposes the matched documents
                 as a sized, iterable and sliceable collection of result dicts.

                 Each result is the dict of stored fields of a document, extended
                 with ``content_short``, ``content_short_hl`` and ``f_path``.
                 """

                 def __init__(self, search_type, searcher, matcher, highlight_items):
                     self.search_type = search_type
                     self.searcher = searcher
                     self.matcher = matcher
                     self.highlight_items = highlight_items
                     # floor division keeps this an int (it is used to build slice
                     # bounds) even when true division is in effect
                     self.fragment_size = 200 // 2

                 @LazyProperty
                 def doc_ids(self):
                     """
                     List of ``[docnum, chunks]`` pairs, one per matched document.

                     Walks the matcher until it is no longer active, so the matcher
                     is consumed by the first access.
                     """
                     docs_id = []
                     while self.matcher.is_active():
                         docnum = self.matcher.id()
                         # chunks have to be collected while the matcher still
                         # points at this document
                         chunks = list(self.get_chunks())
                         docs_id.append([docnum, chunks])
                         self.matcher.next()
                     return docs_id

                 def __str__(self):
                     return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

                 def __repr__(self):
                     return self.__str__()

                 def __len__(self):
                     return len(self.doc_ids)

                 def __iter__(self):
                     """
                     Allows iteration over results, generating the content of each
                     result lazily.
                     """
                     for docid in self.doc_ids:
                         yield self.get_full_content(docid)

                 def __getitem__(self, key):
                     """
                     Indexing and slicing of ResultWrapper.

                     :param key: a ``slice`` (returns a list of results) or an
                         integer index (returns a single result)
                     """
                     if isinstance(key, slice):
                         return [self.get_full_content(docid)
                                 for docid in self.doc_ids[key]]
                     return self.get_full_content(self.doc_ids[key])

                 def get_full_content(self, docid):
                     """
                     Build the result dict for one ``[docnum, chunks]`` entry of
                     ``doc_ids``.
                     """
                     res = self.searcher.stored_fields(docid[0])
                     repository = res['repository']
                     path = res['path']
                     # file path relative to the repository: everything after the
                     # first occurrence of the repository name in the stored path
                     f_path = path[path.find(repository)
                                   + len(repository):].lstrip('/')

                     content_short = self.get_short_content(res, docid[1])
                     res.update({'content_short': content_short,
                                 'content_short_hl': self.highlight(content_short),
                                 'f_path': f_path})

                     return res

                 def get_short_content(self, res, chunks):
                     """
                     Concatenate the ``(start, end)`` slices of ``res['content']``
                     described by ``chunks``.
                     """
                     return ''.join([res['content'][start:end]
                                     for start, end in chunks])

                 def get_chunks(self):
                     """
                     Yield ``(start, end)`` character offsets around every span of
                     the document the matcher currently points at.

                     Each span is widened by ``fragment_size`` on both sides. A
                     chunk never starts before the end of the previous one, so
                     close occurrences are not included (and highlighted) twice.
                     """
                     last_end = 0
                     for span in self.matcher.spans():
                         # a missing char offset is treated as 0
                         start = span.startchar or 0
                         end = span.endchar or 0
                         start_offseted = max(0, start - self.fragment_size)
                         end_offseted = end + self.fragment_size

                         if start_offseted < last_end:
                             start_offseted = last_end
                         last_end = end_offseted
                         yield (start_offseted, end_offseted,)

                 def highlight(self, content, top=5):
                     """
                     Return ``content`` HTML-escaped with the searched terms
                     highlighted, or an empty string when the search type is
                     not ``'content'``.
                     """
                     if self.search_type != 'content':
                         return ''
                     hl = highlight(escape(content),
                                    self.highlight_items,
                                    analyzer=ANALYZER,
                                    fragmenter=FRAGMENTER,
                                    formatter=FORMATTER,
                                    top=top)
                     return hl