upstream/kallithea Commit - r1409:c3172bc0

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.__init__

3

rhodecode.lib.indexers.__init__

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

Whoosh indexing module for RhodeCode

6

Whoosh indexing module for RhodeCode

7

8

:created_on: Aug 17, 2010

8

:created_on: Aug 17, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software: you can redistribute it and/or modify

13

# This program is free software: you can redistribute it and/or modify

14

# it under the terms of the GNU General Public License as published by

14

# it under the terms of the GNU General Public License as published by

15

# the Free Software Foundation, either version 3 of the License, or

15

# the Free Software Foundation, either version 3 of the License, or

16

# (at your option) any later version.

16

# (at your option) any later version.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

25

import os

25

import os

26

import sys

26

import sys

27

import traceback

27

import traceback

28

from os.path import dirname as dn, join as jn

28

from os.path import dirname as dn, join as jn

29

30

#to get the rhodecode import

30

#to get the rhodecode import

31

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

31

sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

32

33

from string import strip

33

from string import strip

34

from shutil import rmtree

34

from shutil import rmtree

35

36

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

36

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

37

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

37

from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

38

from whoosh.index import create_in, open_dir

38

from whoosh.index import create_in, open_dir

39

from whoosh.formats import Characters

39

from whoosh.formats import Characters

40

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

40

from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

41

42

from webhelpers.html.builder import escape

42

from webhelpers.html.builder import escape

43

from sqlalchemy import engine_from_config

43

from sqlalchemy import engine_from_config

44

from vcs.utils.lazy import LazyProperty

44

from vcs.utils.lazy import LazyProperty

45

46

from rhodecode.model import init_model

46

from rhodecode.model import init_model

47

from rhodecode.model.scm import ScmModel

47

from rhodecode.model.scm import ScmModel

48

from rhodecode.model.repo import RepoModel

48

from rhodecode.model.repo import RepoModel

49

from rhodecode.config.environment import load_environment

49

from rhodecode.config.environment import load_environment

50

from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP

50

from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP

51

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

51

from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

52

53

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

53

#EXTENSIONS WE WANT TO INDEX CONTENT OFF

54

INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()

54

INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()

55

56

#CUSTOM ANALYZER wordsplit + lowercase filter

56

#CUSTOM ANALYZER wordsplit + lowercase filter

57

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

57

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

58

59

60

#INDEX SCHEMA DEFINITION

60

#INDEX SCHEMA DEFINITION

61

SCHEMA = Schema(owner=TEXT(),

61

SCHEMA = Schema(owner=TEXT(),

62

repository=TEXT(stored=True),

62

repository=TEXT(stored=True),

63

path=TEXT(stored=True),

63

path=TEXT(stored=True),

64

content=FieldType(format=Characters(ANALYZER),

64

content=FieldType(format=Characters(ANALYZER),

65

scorable=True, stored=True),

65

scorable=True, stored=True),

66

modtime=STORED(), extension=TEXT(stored=True))

66

modtime=STORED(), extension=TEXT(stored=True))

67

68

69

IDX_NAME = 'HG_INDEX'

69

IDX_NAME = 'HG_INDEX'

70

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

70

FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')

71

FRAGMENTER = SimpleFragmenter(200)

71

FRAGMENTER = SimpleFragmenter(200)

72

73

74

class MakeIndex(BasePasterCommand):

74

class MakeIndex(BasePasterCommand):

75

76

max_args = 1

76

max_args = 1

77

min_args = 1

77

min_args = 1

78

79

usage = "CONFIG_FILE"

79

usage = "CONFIG_FILE"

80

summary = "Creates index for full text search given configuration file"

80

summary = "Creates index for full text search given configuration file"

81

group_name = "RhodeCode"

81

group_name = "RhodeCode"

82

takes_config_file = -1

82

takes_config_file = -1

83

parser = Command.standard_parser(verbose=True)

83

parser = Command.standard_parser(verbose=True)

84

85

def command(self):

85

def command(self):

86

87

from pylons import config

87

from pylons import config

88

add_cache(config)

88

add_cache(config)

89

engine = engine_from_config(config, 'sqlalchemy.db1.')

89

engine = engine_from_config(config, 'sqlalchemy.db1.')

90

init_model(engine)

90

init_model(engine)

91

92

index_location = config['index_dir']

92

index_location = config['index_dir']

93

repo_location = self.options.repo_location if ~~self~~.~~options~~.~~repo_location~~ ~~else~~ ~~RepoModel~~().~~repos_path~~

93

repo_location = self.options.repo_location \

94

if self.options.repo_location else RepoModel().repos_path

94

repo_list = map(strip, self.options.repo_list.split(',')) \

95

repo_list = map(strip, self.options.repo_list.split(',')) \

95

if self.options.repo_list else None

96

if self.options.repo_list else None

96

97

#======================================================================

98

#======================================================================

98

# WHOOSH DAEMON

99

# WHOOSH DAEMON

99

#======================================================================

100

#======================================================================

100

from rhodecode.lib.pidlock import LockHeld, DaemonLock

101

from rhodecode.lib.pidlock import LockHeld, DaemonLock

101

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

102

from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

102

try:

103

try:

103

l = DaemonLock(file=jn(dn(dn(index_location)), 'make_index.lock'))

104

l = DaemonLock(file=jn(dn(dn(index_location)), 'make_index.lock'))

104

WhooshIndexingDaemon(index_location=index_location,

105

WhooshIndexingDaemon(index_location=index_location,

105

repo_location=repo_location,

106

repo_location=repo_location,

106

repo_list=repo_list)\

107

repo_list=repo_list)\

107

.run(full_index=self.options.full_index)

108

.run(full_index=self.options.full_index)

108

l.release()

109

l.release()

109

except LockHeld:

110

except LockHeld:

110

sys.exit(1)

111

sys.exit(1)

111

112

def update_parser(self):

113

def update_parser(self):

113

self.parser.add_option('--repo-location',

114

self.parser.add_option('--repo-location',

114

action='store',

115

action='store',

115

dest='repo_location',

116

dest='repo_location',

116

help="Specifies repositories location to index OPTIONAL",

117

help="Specifies repositories location to index OPTIONAL",

117

)

118

)

118

self.parser.add_option('--index-only',

119

self.parser.add_option('--index-only',

119

action='store',

120

action='store',

120

dest='repo_list',

121

dest='repo_list',

121

help="Specifies a comma separated list of repositores "

122

help="Specifies a comma separated list of repositores "

122

"to build index on OPTIONAL",

123

"to build index on OPTIONAL",

123

)

124

)

124

self.parser.add_option('-f',

125

self.parser.add_option('-f',

125

action='store_true',

126

action='store_true',

126

dest='full_index',

127

dest='full_index',

127

help="Specifies that index should be made full i.e"

128

help="Specifies that index should be made full i.e"

128

" destroy old and build from scratch",

129

" destroy old and build from scratch",

129

default=False)

130

default=False)

130

131

class ResultWrapper(object):

132

class ResultWrapper(object):

132

def __init__(self, search_type, searcher, matcher, highlight_items):

133

def __init__(self, search_type, searcher, matcher, highlight_items):

133

self.search_type = search_type

134

self.search_type = search_type

134

self.searcher = searcher

135

self.searcher = searcher

135

self.matcher = matcher

136

self.matcher = matcher

136

self.highlight_items = highlight_items

137

self.highlight_items = highlight_items

137

self.fragment_size = 200 / 2

138

self.fragment_size = 200 / 2

138

139

@LazyProperty

140

@LazyProperty

140

def doc_ids(self):

141

def doc_ids(self):

141

docs_id = []

142

docs_id = []

142

while self.matcher.is_active():

143

while self.matcher.is_active():

143

docnum = self.matcher.id()

144

docnum = self.matcher.id()

144

chunks = [offsets for offsets in self.get_chunks()]

145

chunks = [offsets for offsets in self.get_chunks()]

145

docs_id.append([docnum, chunks])

146

docs_id.append([docnum, chunks])

146

self.matcher.next()

147

self.matcher.next()

147

return docs_id

148

return docs_id

148

149

def __str__(self):

150

def __str__(self):

150

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

151

return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

151

152

def __repr__(self):

153

def __repr__(self):

153

return self.__str__()

154

return self.__str__()

154

155

def __len__(self):

156

def __len__(self):

156

return len(self.doc_ids)

157

return len(self.doc_ids)

157

158

def __iter__(self):

159

def __iter__(self):

159

"""

160

"""

160

Allows Iteration over results,and lazy generate content

161

Allows Iteration over results,and lazy generate content

161

162

*Requires* implementation of ``__getitem__`` method.

163

*Requires* implementation of ``__getitem__`` method.

163

"""

164

"""

164

for docid in self.doc_ids:

165

for docid in self.doc_ids:

165

yield self.get_full_content(docid)

166

yield self.get_full_content(docid)

166

167

def __getitem__(self, key):

168

def __getitem__(self, key):

168

"""

169

"""

169

Slicing of resultWrapper

170

Slicing of resultWrapper

170

"""

171

"""

171

i, j = key.start, key.stop

172

i, j = key.start, key.stop

172

173

slice = []

174

slice = []

174

for docid in self.doc_ids[i:j]:

175

for docid in self.doc_ids[i:j]:

175

slice.append(self.get_full_content(docid))

176

slice.append(self.get_full_content(docid))

176

return slice

177

return slice

177

178

179

def get_full_content(self, docid):

180

def get_full_content(self, docid):

180

res = self.searcher.stored_fields(docid[0])

181

res = self.searcher.stored_fields(docid[0])

181

f_path = res['path'][res['path'].find(res['repository']) \

182

f_path = res['path'][res['path'].find(res['repository']) \

182

+ len(res['repository']):].lstrip('/')

183

+ len(res['repository']):].lstrip('/')

183

184

content_short = self.get_short_content(res, docid[1])

185

content_short = self.get_short_content(res, docid[1])

185

res.update({'content_short':content_short,

186

res.update({'content_short':content_short,

186

'content_short_hl':self.highlight(content_short),

187

'content_short_hl':self.highlight(content_short),

187

'f_path':f_path})

188

'f_path':f_path})

188

189

return res

190

return res

190

191

def get_short_content(self, res, chunks):

192

def get_short_content(self, res, chunks):

192

193

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

194

return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

194

195

def get_chunks(self):

196

def get_chunks(self):

196

"""

197

"""

197

Smart function that implements chunking the content

198

Smart function that implements chunking the content

198

but not overlap chunks so it doesn't highlight the same

199

but not overlap chunks so it doesn't highlight the same

199

close occurrences twice.

200

close occurrences twice.

200

201

:param matcher:

202

:param matcher:

202

:param size:

203

:param size:

203

"""

204

"""

204

memory = [(0, 0)]

205

memory = [(0, 0)]

205

for span in self.matcher.spans():

206

for span in self.matcher.spans():

206

start = span.startchar or 0

207

start = span.startchar or 0

207

end = span.endchar or 0

208

end = span.endchar or 0

208

start_offseted = max(0, start - self.fragment_size)

209

start_offseted = max(0, start - self.fragment_size)

209

end_offseted = end + self.fragment_size

210

end_offseted = end + self.fragment_size

210

211

if start_offseted < memory[-1][1]:

212

if start_offseted < memory[-1][1]:

212

start_offseted = memory[-1][1]

213

start_offseted = memory[-1][1]

213

memory.append((start_offseted, end_offseted,))

214

memory.append((start_offseted, end_offseted,))

214

yield (start_offseted, end_offseted,)

215

yield (start_offseted, end_offseted,)

215

216

def highlight(self, content, top=5):

217

def highlight(self, content, top=5):

217

if self.search_type != 'content':

218

if self.search_type != 'content':

218

return ''

219

return ''

219

hl = highlight(escape(content),

220

hl = highlight(escape(content),

220

self.highlight_items,

221

self.highlight_items,

221

analyzer=ANALYZER,

222

analyzer=ANALYZER,

222

fragmenter=FRAGMENTER,

223

fragmenter=FRAGMENTER,

223

formatter=FORMATTER,

224

formatter=FORMATTER,

224

top=top)

225

top=top)

225

return hl

226

return hl

             List of contributors to RhodeCode project:
                 Marcin Kuźmiński <marcin@python-works.com>
                 Lukasz Balcerzak <lukaszbalcerzak@gmail.com>
                 Jason Harris <jason@jasonfharris.com>
                 Thayne Harbaugh  <thayne@fusionio.com>
                 cejones
-                Lorenzo M. Catucci <lorenzo@sancho.ccd.uniroma2.it>
  No newline at end of file
+                Lorenzo M. Catucci <lorenzo@sancho.ccd.uniroma2.it>
+                Dmitri Kuznetsov
+                Jared Bunting <jared.bunting@peachjean.com>
  No newline at end of file

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.__init__
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 Whoosh indexing module for RhodeCode
                 :created_on: Aug 17, 2010
                 :author: marcink
                 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation, either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             import os
             import sys
             import traceback
             from os.path import dirname as dn, join as jn
             #to get the rhodecode import
             sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
             from string import strip
             from shutil import rmtree
             from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
             from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
             from whoosh.index import create_in, open_dir
             from whoosh.formats import Characters
             from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
             from webhelpers.html.builder import escape
             from sqlalchemy import engine_from_config
             from vcs.utils.lazy import LazyProperty
             from rhodecode.model import init_model
             from rhodecode.model.scm import ScmModel
             from rhodecode.model.repo import RepoModel
             from rhodecode.config.environment import load_environment
             from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
             from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
             # Extensions of the files whose content we want to index: the keys of
             # LANGUAGES_EXTENSIONS_MAP.
             INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
             # Custom analyzer: split the text into ``\w+`` tokens, then pass them
             # through a lowercase filter.
             ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
             # Index schema definition. ``content`` is a hand-built field type that
             # uses the custom ANALYZER through a ``Characters`` format; ``modtime``
             # is declared as STORED, the remaining fields are TEXT.
             SCHEMA = Schema(owner=TEXT(),
                             repository=TEXT(stored=True),
                             path=TEXT(stored=True),
                             content=FieldType(format=Characters(ANALYZER),
                                          scorable=True, stored=True),
                             modtime=STORED(), extension=TEXT(stored=True))
             # NOTE(review): presumably the name under which the Whoosh index is
             # created/opened -- it is not used in this module, confirm in the daemon.
             IDX_NAME = 'HG_INDEX'
             # Formatter and fragmenter handed to whoosh ``highlight()`` by
             # ResultWrapper.highlight. The formatter is built around a ``span`` tag
             # and a "..." break marker as the ``between`` text.
             FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
             # NOTE(review): 200 is presumably the fragment size in characters --
             # confirm against the whoosh version in use.
             FRAGMENTER = SimpleFragmenter(200)
             class MakeIndex(BasePasterCommand):
                 """
                 Paster command that creates the index for full text search.

                 Takes exactly one positional argument (the configuration file) and
                 the ``--repo-location``, ``--index-only`` and ``-f`` options that
                 are registered in :meth:`update_parser`.
                 """

                 max_args = 1
                 min_args = 1
                 usage = "CONFIG_FILE"
                 summary = "Creates index for full text search given configuration file"
                 group_name = "RhodeCode"
                 takes_config_file = -1
                 parser = Command.standard_parser(verbose=True)

                 def command(self):
                     """
                     Run the Whoosh indexing daemon with the parsed options.

                     Sets up the cache and the database model from the pylons
                     ``config``, then runs ``WhooshIndexingDaemon`` while holding a
                     ``DaemonLock`` on ``make_index.lock``. The lock is released
                     whether or not indexing succeeds. Exits the process with
                     status 1 when ``LockHeld`` is raised.
                     """
                     from pylons import config
                     add_cache(config)
                     engine = engine_from_config(config, 'sqlalchemy.db1.')
                     init_model(engine)
                     index_location = config['index_dir']
                     # an explicit --repo-location wins; RepoModel is only consulted
                     # when the option was not given
                     repo_location = (self.options.repo_location
                                      or RepoModel().repos_path)
                     # --index-only is a comma separated list; None means no list
                     # was given
                     repo_list = None
                     if self.options.repo_list:
                         repo_list = [name.strip() for name
                                      in self.options.repo_list.split(',')]
                     #======================================================================
                     # WHOOSH DAEMON
                     #======================================================================
                     from rhodecode.lib.pidlock import LockHeld, DaemonLock
                     from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
                     lock_file = jn(dn(dn(index_location)), 'make_index.lock')
                     try:
                         lock = DaemonLock(file=lock_file)
                         try:
                             daemon = WhooshIndexingDaemon(
                                 index_location=index_location,
                                 repo_location=repo_location,
                                 repo_list=repo_list)
                             daemon.run(full_index=self.options.full_index)
                         finally:
                             # release the lock even when indexing raises, so a
                             # failed run does not leave the lock held
                             lock.release()
                     except LockHeld:
                         # NOTE(review): presumably another indexing run holds the
                         # lock -- confirm against rhodecode.lib.pidlock
                         sys.exit(1)

                 def update_parser(self):
                     """
                     Register the command specific options on ``self.parser``:
                     ``--repo-location``, ``--index-only`` and ``-f``.
                     """
                     self.parser.add_option('--repo-location',
                                       action='store',
                                       dest='repo_location',
                                       help="Specifies repositories location to index OPTIONAL",
                                       )
                     self.parser.add_option('--index-only',
                                       action='store',
                                       dest='repo_list',
                                       help="Specifies a comma separated list of repositores "
                                             "to build index on OPTIONAL",
                                       )
                     self.parser.add_option('-f',
                                       action='store_true',
                                       dest='full_index',
                                       help="Specifies that index should be made full i.e"
                                             " destroy old and build from scratch",
                                       default=False)
             class ResultWrapper(object):
                 """
                 Sliceable, iterable view over the documents matched by a search.

                 Document numbers and the character ranges to show for each of
                 them are collected from the matcher in :attr:`doc_ids`; the stored
                 fields and the shortened/highlighted content are only built when a
                 result is actually requested.

                 :param search_type: kind of search; highlighting is only done
                     for ``'content'``
                 :param searcher: object providing ``stored_fields(docnum)``
                 :param matcher: object providing ``is_active()``, ``id()``,
                     ``spans()`` and ``next()``
                 :param highlight_items: terms passed on to whoosh ``highlight``
                 """

                 def __init__(self, search_type, searcher, matcher, highlight_items):
                     self.search_type = search_type
                     self.searcher = searcher
                     self.matcher = matcher
                     self.highlight_items = highlight_items
                     # floor division keeps this an int under true division too;
                     # get_chunks() turns it into string slice offsets
                     self.fragment_size = 200 // 2

                 @LazyProperty
                 def doc_ids(self):
                     """
                     List of ``[docnum, chunks]`` pairs, one per matched document,
                     where ``chunks`` is the list of ``(start, end)`` offsets
                     produced by :meth:`get_chunks`. Building it walks the matcher
                     until it is no longer active.
                     """
                     docs_id = []
                     while self.matcher.is_active():
                         docnum = self.matcher.id()
                         chunks = list(self.get_chunks())
                         docs_id.append([docnum, chunks])
                         self.matcher.next()
                     return docs_id

                 def __str__(self):
                     return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

                 def __repr__(self):
                     return self.__str__()

                 def __len__(self):
                     return len(self.doc_ids)

                 def __iter__(self):
                     """
                     Iterate over the results, building the full content of each
                     one only when it is reached.
                     """
                     for docid in self.doc_ids:
                         yield self.get_full_content(docid)

                 def __getitem__(self, key):
                     """
                     Return the full content of the requested results.

                     :param key: a ``slice`` (only ``start`` and ``stop`` are
                         honoured, the step is ignored) or an integer index
                     :returns: list of result dicts for a slice, a single result
                         dict for an integer index
                     """
                     if isinstance(key, slice):
                         return [self.get_full_content(docid)
                                 for docid in self.doc_ids[key.start:key.stop]]
                     return self.get_full_content(self.doc_ids[key])

                 def get_full_content(self, docid):
                     """
                     Build the result dict for one entry of :attr:`doc_ids`.

                     :param docid: ``[docnum, chunks]`` pair
                     :returns: the stored fields of the document, extended with
                         ``content_short``, ``content_short_hl`` and ``f_path``
                     """
                     res = self.searcher.stored_fields(docid[0])
                     path = res['path']
                     repo_name = res['repository']
                     # f_path is what follows the repository name inside the path.
                     # NOTE(review): when the name is not part of the path, find()
                     # returns -1 and the slice start is off -- confirm it cannot
                     # happen for indexed documents.
                     f_path = path[path.find(repo_name) + len(repo_name):].lstrip('/')

                     content_short = self.get_short_content(res, docid[1])
                     res.update({'content_short': content_short,
                                 'content_short_hl': self.highlight(content_short),
                                 'f_path': f_path})
                     return res

                 def get_short_content(self, res, chunks):
                     """
                     Join the ``(start, end)`` slices of ``res['content']`` given
                     in ``chunks`` into one string.
                     """
                     return ''.join([res['content'][chunk[0]:chunk[1]]
                                     for chunk in chunks])

                 def get_chunks(self):
                     """
                     Yield ``(start, end)`` character offsets around every span of
                     the current match, widened by ``fragment_size`` on both sides.

                     A chunk never starts before the end of the previous one, so
                     close occurrences are not highlighted twice.
                     """
                     last_end = 0
                     for span in self.matcher.spans():
                         start = span.startchar or 0
                         end = span.endchar or 0
                         start_offseted = max(0, start - self.fragment_size)
                         end_offseted = end + self.fragment_size
                         # do not overlap the chunk that was yielded before
                         start_offseted = max(start_offseted, last_end)
                         last_end = end_offseted
                         yield (start_offseted, end_offseted)

                 def highlight(self, content, top=5):
                     """
                     Return ``content`` escaped and highlighted with whoosh, or an
                     empty string when the search type is not ``'content'``.

                     :param content: text to highlight
                     :param top: passed on as ``top`` to whoosh ``highlight``
                     """
                     if self.search_type != 'content':
                         return ''
                     hl = highlight(escape(content),
                                    self.highlight_items,
                                    analyzer=ANALYZER,
                                    fragmenter=FRAGMENTER,
                                    formatter=FORMATTER,
                                    top=top)
                     return hl