upstream/kallithea Commit - r1407:2744f5b0

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.__init__

3

rhodecode.lib.indexers.__init__

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

Whoosh indexing module for RhodeCode

6

Whoosh indexing module for RhodeCode

7

8

:created_on: Aug 17, 2010

8

:created_on: Aug 17, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software: you can redistribute it and/or modify

13

# This program is free software: you can redistribute it and/or modify

14

# it under the terms of the GNU General Public License as published by

14

# it under the terms of the GNU General Public License as published by

15

# the Free Software Foundation, either version 3 of the License, or

15

# the Free Software Foundation, either version 3 of the License, or

16

# (at your option) any later version.

16

# (at your option) any later version.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

25

import os

25

import os

26

import sys

26

import sys

27

import traceback

27

import traceback

28

from os.path import dirname as dn, join as jn

28

from os.path import dirname as dn, join as jn

29

30

#to get the rhodecode import

30

#to get the rhodecode import

31

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

31

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

32

33

from string import strip

33

from string import strip

34

from shutil import rmtree

34

from shutil import rmtree

35

36

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

36

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

37

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

37

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

38

from whoosh.index import create_in, open_dir

38

from whoosh.index import create_in, open_dir

39

from whoosh.formats import Characters

39

from whoosh.formats import Characters

40

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

40

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

41

42

from webhelpers.html.builder import escape

42

from webhelpers.html.builder import escape

43

from sqlalchemy import engine_from_config

43

from sqlalchemy import engine_from_config

44

from vcs.utils.lazy import LazyProperty

44

from vcs.utils.lazy import LazyProperty

45

46

from rhodecode.model import init_model

46

from rhodecode.model import init_model

47

from rhodecode.model.scm import ScmModel

47

from rhodecode.model.scm import ScmModel

48

from rhodecode.model.repo import RepoModel

48

from rhodecode.config.environment import load_environment

49

from rhodecode.config.environment import load_environment

49

from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP

50

from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP

50

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

51

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

51

52

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

53

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

53

INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()

54

INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()

54

55

#CUSTOM ANALYZER wordsplit + lowercase filter

56

#CUSTOM ANALYZER wordsplit + lowercase filter

56

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

57

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

57

58

59

#INDEX SCHEMA DEFINITION

60

#INDEX SCHEMA DEFINITION

60

SCHEMA = Schema(owner=TEXT(),

61

SCHEMA = Schema(owner=TEXT(),

61

repository=TEXT(stored=True),

62

repository=TEXT(stored=True),

62

path=TEXT(stored=True),

63

path=TEXT(stored=True),

63

content=FieldType(format=Characters(ANALYZER),

64

content=FieldType(format=Characters(ANALYZER),

64

scorable=True, stored=True),

65

scorable=True, stored=True),

65

modtime=STORED(), extension=TEXT(stored=True))

66

modtime=STORED(), extension=TEXT(stored=True))

66

67

68

IDX_NAME = 'HG_INDEX'

69

IDX_NAME = 'HG_INDEX'

69

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

70

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

70

FRAGMENTER = SimpleFragmenter(200)

71

FRAGMENTER = SimpleFragmenter(200)

71

72

73

class MakeIndex(BasePasterCommand):

74

class MakeIndex(BasePasterCommand):

74

75

max_args = 1

76

max_args = 1

76

min_args = 1

77

min_args = 1

77

78

usage = "CONFIG_FILE"

79

usage = "CONFIG_FILE"

79

summary = "Creates index for full text search given configuration file"

80

summary = "Creates index for full text search given configuration file"

80

group_name = "RhodeCode"

81

group_name = "RhodeCode"

81

takes_config_file = -1

82

takes_config_file = -1

82

parser = Command.standard_parser(verbose=True)

83

parser = Command.standard_parser(verbose=True)

83

84

def command(self):

85

def command(self):

85

86

from pylons import config

87

from pylons import config

87

add_cache(config)

88

add_cache(config)

88

engine = engine_from_config(config, 'sqlalchemy.db1.')

89

engine = engine_from_config(config, 'sqlalchemy.db1.')

89

init_model(engine)

90

init_model(engine)

90

91

index_location = config['index_dir']

92

index_location = config['index_dir']

92

repo_location = self.options.repo_location

93

repo_location = self.options.repo_location if self.options.repo_location else RepoModel().repos_path

93

repo_list = map(strip, self.options.repo_list.split(',')) \

94

repo_list = map(strip, self.options.repo_list.split(',')) \

94

if self.options.repo_list else None

95

if self.options.repo_list else None

95

96

#======================================================================

97

#======================================================================

97

# WHOOSH DAEMON

98

# WHOOSH DAEMON

98

#======================================================================

99

#======================================================================

99

from rhodecode.lib.pidlock import LockHeld, DaemonLock

100

from rhodecode.lib.pidlock import LockHeld, DaemonLock

100

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

101

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

101

try:

102

try:

102

l = DaemonLock(file=jn(dn(dn(index_location)), 'make_index.lock'))

103

l = DaemonLock(file=jn(dn(dn(index_location)), 'make_index.lock'))

103

WhooshIndexingDaemon(index_location=index_location,

104

WhooshIndexingDaemon(index_location=index_location,

104

repo_location=repo_location,

105

repo_location=repo_location,

105

repo_list=repo_list)\

106

repo_list=repo_list)\

106

.run(full_index=self.options.full_index)

107

.run(full_index=self.options.full_index)

107

l.release()

108

l.release()

108

except LockHeld:

109

except LockHeld:

109

sys.exit(1)

110

sys.exit(1)

110

111

def update_parser(self):

112

def update_parser(self):

112

self.parser.add_option('--repo-location',

113

self.parser.add_option('--repo-location',

113

action='store',

114

action='store',

114

dest='repo_location',

115

dest='repo_location',

115

help="Specifies repositories location to index REQUIRED",

116

help="Specifies repositories location to index REQUIRED",

116

)

117

)

117

self.parser.add_option('--index-only',

118

self.parser.add_option('--index-only',

118

action='store',

119

action='store',

119

dest='repo_list',

120

dest='repo_list',

120

help="Specifies a comma separated list of repositores "

121

help="Specifies a comma separated list of repositores "

121

"to build index on OPTIONAL",

122

"to build index on OPTIONAL",

122

)

123

)

123

self.parser.add_option('-f',

124

self.parser.add_option('-f',

124

action='store_true',

125

action='store_true',

125

dest='full_index',

126

dest='full_index',

126

help="Specifies that index should be made full i.e"

127

help="Specifies that index should be made full i.e"

127

" destroy old and build from scratch",

128

" destroy old and build from scratch",

128

default=False)

129

default=False)

129

130

class ResultWrapper(object):

131

class ResultWrapper(object):

131

def __init__(self, search_type, searcher, matcher, highlight_items):

132

def __init__(self, search_type, searcher, matcher, highlight_items):

132

self.search_type = search_type

133

self.search_type = search_type

133

self.searcher = searcher

134

self.searcher = searcher

134

self.matcher = matcher

135

self.matcher = matcher

135

self.highlight_items = highlight_items

136

self.highlight_items = highlight_items

136

self.fragment_size = 200 / 2

137

self.fragment_size = 200 / 2

137

138

@LazyProperty

139

@LazyProperty

139

def doc_ids(self):

140

def doc_ids(self):

140

docs_id = []

141

docs_id = []

141

while self.matcher.is_active():

142

while self.matcher.is_active():

142

docnum = self.matcher.id()

143

docnum = self.matcher.id()

143

chunks = [offsets for offsets in self.get_chunks()]

144

chunks = [offsets for offsets in self.get_chunks()]

144

docs_id.append([docnum, chunks])

145

docs_id.append([docnum, chunks])

145

self.matcher.next()

146

self.matcher.next()

146

return docs_id

147

return docs_id

147

148

def __str__(self):

149

def __str__(self):

149

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

150

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

150

151

def __repr__(self):

152

def __repr__(self):

152

return self.__str__()

153

return self.__str__()

153

154

def __len__(self):

155

def __len__(self):

155

return len(self.doc_ids)

156

return len(self.doc_ids)

156

157

def __iter__(self):

158

def __iter__(self):

158

"""

159

"""

159

Allows Iteration over results,and lazy generate content

160

Allows Iteration over results,and lazy generate content

160

161

*Requires* implementation of ``__getitem__`` method.

162

*Requires* implementation of ``__getitem__`` method.

162

"""

163

"""

163

for docid in self.doc_ids:

164

for docid in self.doc_ids:

164

yield self.get_full_content(docid)

165

yield self.get_full_content(docid)

165

166

def __getitem__(self, key):

167

def __getitem__(self, key):

167

"""

168

"""

168

Slicing of resultWrapper

169

Slicing of resultWrapper

169

"""

170

"""

170

i, j = key.start, key.stop

171

i, j = key.start, key.stop

171

172

slice = []

173

slice = []

173

for docid in self.doc_ids[i:j]:

174

for docid in self.doc_ids[i:j]:

174

slice.append(self.get_full_content(docid))

175

slice.append(self.get_full_content(docid))

175

return slice

176

return slice

176

177

178

def get_full_content(self, docid):

179

def get_full_content(self, docid):

179

res = self.searcher.stored_fields(docid[0])

180

res = self.searcher.stored_fields(docid[0])

180

f_path = res['path'][res['path'].find(res['repository']) \

181

f_path = res['path'][res['path'].find(res['repository']) \

181

+ len(res['repository']):].lstrip('/')

182

+ len(res['repository']):].lstrip('/')

182

183

content_short = self.get_short_content(res, docid[1])

184

content_short = self.get_short_content(res, docid[1])

184

res.update({'content_short':content_short,

185

res.update({'content_short':content_short,

185

'content_short_hl':self.highlight(content_short),

186

'content_short_hl':self.highlight(content_short),

186

'f_path':f_path})

187

'f_path':f_path})

187

188

return res

189

return res

189

190

def get_short_content(self, res, chunks):

191

def get_short_content(self, res, chunks):

191

192

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

193

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

193

194

def get_chunks(self):

195

def get_chunks(self):

195

"""

196

"""

196

Smart function that implements chunking the content

197

Smart function that implements chunking the content

197

but not overlap chunks so it doesn't highlight the same

198

but not overlap chunks so it doesn't highlight the same

198

close occurrences twice.

199

close occurrences twice.

199

200

:param matcher:

201

:param matcher:

201

:param size:

202

:param size:

202

"""

203

"""

203

memory = [(0, 0)]

204

memory = [(0, 0)]

204

for span in self.matcher.spans():

205

for span in self.matcher.spans():

205

start = span.startchar or 0

206

start = span.startchar or 0

206

end = span.endchar or 0

207

end = span.endchar or 0

207

start_offseted = max(0, start - self.fragment_size)

208

start_offseted = max(0, start - self.fragment_size)

208

end_offseted = end + self.fragment_size

209

end_offseted = end + self.fragment_size

209

210

if start_offseted < memory[-1][1]:

211

if start_offseted < memory[-1][1]:

211

start_offseted = memory[-1][1]

212

start_offseted = memory[-1][1]

212

memory.append((start_offseted, end_offseted,))

213

memory.append((start_offseted, end_offseted,))

213

yield (start_offseted, end_offseted,)

214

yield (start_offseted, end_offseted,)

214

215

def highlight(self, content, top=5):

216

def highlight(self, content, top=5):

216

if self.search_type != 'content':

217

if self.search_type != 'content':

217

return ''

218

return ''

218

hl = highlight(escape(content),

219

hl = highlight(escape(content),

219

self.highlight_items,

220

self.highlight_items,

220

analyzer=ANALYZER,

221

analyzer=ANALYZER,

221

fragmenter=FRAGMENTER,

222

fragmenter=FRAGMENTER,

222

formatter=FORMATTER,

223

formatter=FORMATTER,

223

top=top)

224

top=top)

224

return hl

225

return hl

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.__init__
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 Whoosh indexing module for RhodeCode
                 :created_on: Aug 17, 2010
                 :author: marcink
                 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation, either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             import os
             import sys
             import traceback
             from os.path import dirname as dn, join as jn
             #to get the rhodecode import
             sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
             from string import strip
             from shutil import rmtree
             from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
             from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
             from whoosh.index import create_in, open_dir
             from whoosh.formats import Characters
             from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
             from webhelpers.html.builder import escape
             from sqlalchemy import engine_from_config
             from vcs.utils.lazy import LazyProperty
             from rhodecode.model import init_model
             from rhodecode.model.scm import ScmModel
+            from rhodecode.model.repo import RepoModel
             from rhodecode.config.environment import load_environment
             from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
             from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
             # Extensions of the files whose content we want to index: the keys of
             # LANGUAGES_EXTENSIONS_MAP
             INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
             # Custom analyzer: split the text into ``\w+`` tokens, then lowercase them
             ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
             # Index schema definition. ``repository``, ``path``, ``content``,
             # ``modtime`` and ``extension`` are stored with each document; ``owner``
             # is declared without ``stored=True``. ``content`` uses the custom
             # ANALYZER through the ``Characters`` format.
             SCHEMA = Schema(owner=TEXT(),
                             repository=TEXT(stored=True),
                             path=TEXT(stored=True),
                             content=FieldType(format=Characters(ANALYZER),
                                          scorable=True, stored=True),
                             modtime=STORED(), extension=TEXT(stored=True))
             # Name of the index -- presumably passed to whoosh when the index is
             # created/opened; the consumer is not visible in this module
             IDX_NAME = 'HG_INDEX'
             # Formatter and fragmenter handed to whoosh's ``highlight``: matches are
             # rendered with ``span`` tags and fragments are joined by a "..." break
             # marker. NOTE(review): 200 looks like the fragment size in characters
             # -- confirm against the whoosh SimpleFragmenter documentation.
             FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
             FRAGMENTER = SimpleFragmenter(200)
             class MakeIndex(BasePasterCommand):
                 """
                 Paster command that builds the Whoosh full text search index.

                 Expects exactly one positional argument (``CONFIG_FILE``) and
                 delegates the indexing itself to ``WhooshIndexingDaemon``.
                 """

                 max_args = 1
                 min_args = 1

                 usage = "CONFIG_FILE"
                 summary = "Creates index for full text search given configuration file"
                 group_name = "RhodeCode"
                 takes_config_file = -1
                 parser = Command.standard_parser(verbose=True)

                 def command(self):
                     """
                     Initialize the model and run the indexing daemon under a lock.

                     The repositories location is taken from ``--repo-location`` and
                     falls back to ``RepoModel().repos_path`` when the option is not
                     given. ``--index-only`` is split on commas into a list of
                     whitespace-stripped names, or ``None`` when not given.

                     Exits with status 1 when ``LockHeld`` is raised.
                     """
                     from pylons import config
                     add_cache(config)
                     engine = engine_from_config(config, 'sqlalchemy.db1.')
                     init_model(engine)

                     index_location = config['index_dir']
                     repo_location = (self.options.repo_location
                                      or RepoModel().repos_path)
                     repo_list = None
                     if self.options.repo_list:
                         repo_list = [name.strip() for name
                                      in self.options.repo_list.split(',')]

                     #======================================================================
                     # WHOOSH DAEMON
                     #======================================================================
                     from rhodecode.lib.pidlock import LockHeld, DaemonLock
                     from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
                     try:
                         lock = DaemonLock(file=jn(dn(dn(index_location)),
                                                   'make_index.lock'))
                         try:
                             WhooshIndexingDaemon(index_location=index_location,
                                                  repo_location=repo_location,
                                                  repo_list=repo_list)\
                                 .run(full_index=self.options.full_index)
                         finally:
                             # release the lock even when indexing fails, so that
                             # a crashed run does not block the next one
                             lock.release()
                     except LockHeld:
                         sys.exit(1)

                 def update_parser(self):
                     """
                     Register the command line options of this command on the parser:
                     ``--repo-location``, ``--index-only`` and ``-f``.
                     """
                     self.parser.add_option('--repo-location',
                                       action='store',
                                       dest='repo_location',
                                       help="Specifies repositories location to index OPTIONAL",
                                       )
                     self.parser.add_option('--index-only',
                                       action='store',
                                       dest='repo_list',
                                       help="Specifies a comma separated list of repositores "
                                             "to build index on OPTIONAL",
                                       )
                     self.parser.add_option('-f',
                                       action='store_true',
                                       dest='full_index',
                                       help="Specifies that index should be made full i.e"
                                             " destroy old and build from scratch",
                                       default=False)
             class ResultWrapper(object):
                 """
                 Wrapper around the matcher of a search that lazily turns the matched
                 documents into dicts of stored fields plus a shortened (and, for
                 ``content`` searches, highlighted) excerpt of the content.

                 Supports ``len()``, iteration, integer indexing and slicing.
                 """

                 def __init__(self, search_type, searcher, matcher, highlight_items):
                     """
                     :param search_type: kind of search; highlighting is only done
                         when it equals ``'content'``
                     :param searcher: object providing ``stored_fields(docnum)``
                     :param matcher: object providing ``is_active()``, ``id()``,
                         ``next()`` and ``spans()``
                     :param highlight_items: terms passed to whoosh's ``highlight``
                     """
                     self.search_type = search_type
                     self.searcher = searcher
                     self.matcher = matcher
                     self.highlight_items = highlight_items
                     # number of characters kept on each side of a matched span;
                     # floor division keeps it an integer usable as a slice bound
                     self.fragment_size = 200 // 2

                 @LazyProperty
                 def doc_ids(self):
                     """
                     List of ``[docnum, chunks]`` pairs collected by walking the
                     matcher until it is no longer active.
                     """
                     docs_id = []
                     while self.matcher.is_active():
                         docnum = self.matcher.id()
                         # collect the chunks before advancing the matcher:
                         # get_chunks() reads self.matcher.spans()
                         chunks = list(self.get_chunks())
                         docs_id.append([docnum, chunks])
                         self.matcher.next()
                     return docs_id

                 def __str__(self):
                     return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

                 def __repr__(self):
                     return self.__str__()

                 def __len__(self):
                     return len(self.doc_ids)

                 def __iter__(self):
                     """
                     Allows iteration over results, generating the content of each
                     result lazily.
                     """
                     for docid in self.doc_ids:
                         yield self.get_full_content(docid)

                 def __getitem__(self, key):
                     """
                     Indexing and slicing of the wrapper.

                     :param key: a ``slice`` or an integer index into ``doc_ids``
                     :returns: a list of result dicts for a slice, a single result
                         dict for an integer index
                     """
                     if isinstance(key, slice):
                         return [self.get_full_content(docid)
                                 for docid in self.doc_ids[key]]
                     return self.get_full_content(self.doc_ids[key])

                 def get_full_content(self, docid):
                     """
                     Build the result dict for one entry of ``doc_ids``.

                     :param docid: a ``[docnum, chunks]`` pair
                     :returns: the stored fields of the document, extended with
                         ``content_short``, ``content_short_hl`` and ``f_path``
                     """
                     res = self.searcher.stored_fields(docid[0])
                     repo_name = res['repository']
                     # f_path is what follows the repository name inside the stored
                     # path; assumes the repository name occurs in the path
                     f_path = res['path'][res['path'].find(repo_name)
                                          + len(repo_name):].lstrip('/')

                     content_short = self.get_short_content(res, docid[1])
                     res.update({'content_short':content_short,
                                 'content_short_hl':self.highlight(content_short),
                                 'f_path':f_path})

                     return res

                 def get_short_content(self, res, chunks):
                     """
                     Join the pieces of ``res['content']`` selected by ``chunks``,
                     an iterable of ``(start, end)`` offsets.
                     """
                     return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

                 def get_chunks(self):
                     """
                     Smart function that implements chunking the content
                     but not overlap chunks so it doesn't highlight the same
                     close occurrences twice.

                     Yields one ``(start, end)`` character offset pair for every span
                     of ``self.matcher.spans()``, widened by ``self.fragment_size`` on
                     both sides.
                     """
                     memory = [(0, 0)]
                     for span in self.matcher.spans():
                         start = span.startchar or 0
                         end = span.endchar or 0
                         start_offseted = max(0, start - self.fragment_size)
                         end_offseted = end + self.fragment_size

                         # never start before the end of the previous chunk
                         if start_offseted < memory[-1][1]:
                             start_offseted = memory[-1][1]
                         memory.append((start_offseted, end_offseted,))
                         yield (start_offseted, end_offseted,)

                 def highlight(self, content, top=5):
                     """
                     Return ``content`` HTML-escaped and highlighted by whoosh, or an
                     empty string when the search type is not ``'content'``.

                     :param content: text to highlight
                     :param top: passed through to whoosh's ``highlight``
                     """
                     if self.search_type != 'content':
                         return ''
                     hl = highlight(escape(content),
                              self.highlight_items,
                              analyzer=ANALYZER,
                              fragmenter=FRAGMENTER,
                              formatter=FORMATTER,
                              top=top)
                     return hl