upstream/kallithea Files · rhodecode/lib/indexers/__init__.py

fixed license issue

marcink - - Load All Authors

File last commit:

r1206:a671db5b beta


                r1206:a671db5b

beta

Download file

             __init__.py
        
                    230 lines
            
             | 8.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / indexers / __init__.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # -*- coding: utf-8 -*-

      """

          rhodecode.lib.indexers.__init__

          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

          Whoosh indexing module for RhodeCode

          :created_on: Aug 17, 2010

          :author: marcink

          :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>

          :license: GPLv3, see COPYING for more details.

      """

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU General Public License as published by

      # the Free Software Foundation, either version 3 of the License, or

      # (at your option) any later version.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      import os

      import sys

      import traceback

      from os.path import dirname as dn, join as jn

      #to get the rhodecode import

      sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

      from string import strip

      from rhodecode.model import init_model

      from rhodecode.model.scm import ScmModel

      from rhodecode.config.environment import load_environment

      from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

      from shutil import rmtree

      from webhelpers.html.builder import escape

      from vcs.utils.lazy import LazyProperty

      from sqlalchemy import engine_from_config

      from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

      from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

      from whoosh.index import create_in, open_dir

      from whoosh.formats import Characters

      from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

      # File extensions whose content gets indexed.
      INDEX_EXTENSIONS = [
          'action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd',
          'c', 'cfg', 'cfm', 'cpp', 'cs', 'css',
          'diff', 'do',
          'el', 'erl',
          'h', 'htm', 'html',
          'ini',
          'java', 'js', 'jsp', 'jspx',
          'lisp', 'lua',
          'm', 'mako', 'ml',
          'pas', 'patch', 'php', 'php3', 'php4', 'phtml', 'pm', 'py',
          'rb', 'rst',
          's', 'sh', 'sql',
          'tpl', 'txt',
          'vim',
          'wss',
          'xhtml', 'xml', 'xsl', 'xslt',
          'yaws',
      ]

      # Custom analyzer: split the text into ``\w+`` tokens, then lowercase them.
      ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

      # Index schema definition; ``content`` is analyzed with ANALYZER above.
      SCHEMA = Schema(
          owner=TEXT(),
          repository=TEXT(stored=True),
          path=TEXT(stored=True),
          content=FieldType(
              format=Characters(ANALYZER),
              scorable=True,
              stored=True
          ),
          modtime=STORED(),
          extension=TEXT(stored=True)
      )

      # Name under which the index is kept.
      IDX_NAME = 'HG_INDEX'

      # Highlight output: 'span' tags, with a "break" span placed between
      # fragments.
      FORMATTER = HtmlFormatter(
          'span',
          between='\n<span class="break">...</span>\n'
      )

      # Fragmenter used for highlighting, configured with a size of 200.
      FRAGMENTER = SimpleFragmenter(200)

      class MakeIndex(BasePasterCommand):
          """
          Paster command that runs the Whoosh indexing daemon.

          Takes exactly one positional argument, the configuration file, plus
          the options registered in :meth:`update_parser`.
          """

          max_args = 1
          min_args = 1

          usage = "CONFIG_FILE"
          summary = "Creates index for full text search given configuration file"
          group_name = "RhodeCode"
          takes_config_file = -1
          parser = Command.standard_parser(verbose=True)

          def command(self):
              """
              Set up cache and database from the pylons config, then run
              ``WhooshIndexingDaemon`` while holding a ``DaemonLock``.

              The index location comes from ``config['index_dir']``; the
              repositories location, optional repository list and the
              full-index flag come from the command line options.

              Exits the process with status 1 when ``LockHeld`` is raised.
              """
              from pylons import config
              add_cache(config)
              engine = engine_from_config(config, 'sqlalchemy.db1.')
              init_model(engine)

              index_location = config['index_dir']
              repo_location = self.options.repo_location

              # ``--index-only`` is a comma separated list; strip whitespace
              # around every name. ``None`` means no restriction was given.
              raw_repo_list = self.options.repo_list
              if raw_repo_list:
                  repo_list = [name.strip() for name in raw_repo_list.split(',')]
              else:
                  repo_list = None

              #==================================================================
              # WHOOSH DAEMON
              #==================================================================
              from rhodecode.lib.pidlock import LockHeld, DaemonLock
              from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
              try:
                  lock = DaemonLock()
                  try:
                      daemon = WhooshIndexingDaemon(index_location=index_location,
                                                    repo_location=repo_location,
                                                    repo_list=repo_list)
                      daemon.run(full_index=self.options.full_index)
                  finally:
                      # Release the lock even when indexing fails, so a crashed
                      # run does not leave the lock held.
                      lock.release()
              except LockHeld:
                  sys.exit(1)

          def update_parser(self):
              """Register the command line options read by :meth:`command`."""
              self.parser.add_option('--repo-location',
                                action='store',
                                dest='repo_location',
                                help="Specifies repositories location to index REQUIRED",
                                )
              self.parser.add_option('--index-only',
                                action='store',
                                dest='repo_list',
                                help="Specifies a comma separated list of repositores "
                                      "to build index on OPTIONAL",
                                )
              self.parser.add_option('-f',
                                action='store_true',
                                dest='full_index',
                                help="Specifies that index should be made full i.e"
                                      " destroy old and build from scratch",
                                default=False)

      class ResultWrapper(object):
          """
          Sliceable, iterable view over the documents hit by a matcher.

          Every item handed out is the stored-fields dict of one document,
          extended with ``content_short``, ``content_short_hl`` and ``f_path``
          (see :meth:`get_full_content`).
          """

          def __init__(self, search_type, searcher, matcher, highlight_items):
              """
              :param search_type: kind of search; highlighting is only produced
                  when it equals ``'content'``
              :param searcher: object whose ``stored_fields(docnum)`` returns
                  the stored fields of a document
              :param matcher: matcher providing ``is_active()``, ``id()``,
                  ``next()`` and ``spans()``
              :param highlight_items: terms handed to the highlighter
              """
              self.search_type = search_type
              self.searcher = searcher
              self.matcher = matcher
              self.highlight_items = highlight_items
              # Number of characters kept on each side of a match. Floor
              # division keeps it an int even under true division, since it
              # ends up in slice bounds.
              self.fragment_size = 200 // 2

          @LazyProperty
          def doc_ids(self):
              """
              Walk the matcher to its end and return a list of
              ``[docnum, chunks]`` entries, where ``chunks`` is the list of
              ``(start, end)`` offsets produced by :meth:`get_chunks`.
              """
              docs_id = []
              while self.matcher.is_active():
                  docnum = self.matcher.id()
                  # Chunks are read from the matcher's current position, so
                  # they must be collected before advancing it.
                  docs_id.append([docnum, list(self.get_chunks())])
                  self.matcher.next()
              return docs_id

          def __str__(self):
              return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

          def __repr__(self):
              return self.__str__()

          def __len__(self):
              return len(self.doc_ids)

          def __iter__(self):
              """
              Iterate over the results, generating the content of each one
              lazily.
              """
              for docid in self.doc_ids:
                  yield self.get_full_content(docid)

          def __getitem__(self, key):
              """
              Index or slice the results.

              :param key: a ``slice`` (returns a list of result dicts) or an
                  integer index (returns a single result dict)
              :raises IndexError: for an integer index out of range
              """
              if isinstance(key, slice):
                  return [self.get_full_content(docid)
                          for docid in self.doc_ids[key]]
              return self.get_full_content(self.doc_ids[key])

          def get_full_content(self, docid):
              """
              Build the result dict for one ``[docnum, chunks]`` entry.

              The stored fields of the document are extended with:

              * ``content_short`` - the content cut down to the given chunks
              * ``content_short_hl`` - highlighted version of the above
              * ``f_path`` - the part of ``path`` that follows the repository
                name, without leading slashes

              :param docid: ``[docnum, chunks]`` entry taken from ``doc_ids``
              """
              docnum, chunks = docid[0], docid[1]
              res = self.searcher.stored_fields(docnum)

              path = res['path']
              repository = res['repository']
              repo_pos = path.find(repository)
              if repo_pos == -1:
                  # Repository name is not part of the path: keep the whole
                  # path instead of slicing at a bogus offset.
                  f_path = path.lstrip('/')
              else:
                  f_path = path[repo_pos + len(repository):].lstrip('/')

              content_short = self.get_short_content(res, chunks)
              res.update({'content_short': content_short,
                          'content_short_hl': self.highlight(content_short),
                          'f_path': f_path})

              return res

          def get_short_content(self, res, chunks):
              """
              Join the ``(start, end)`` slices of ``res['content']`` named by
              ``chunks`` into one string.
              """
              content = res['content']
              return ''.join([content[start:end] for start, end in chunks])

          def get_chunks(self):
              """
              Yield one ``(start, end)`` character range per span of the
              matcher's current document.

              Each range is the span widened by ``fragment_size`` on both
              sides. A range never starts before the end of the previously
              yielded one, so close occurrences are not covered twice.
              """
              previous_end = 0
              for span in self.matcher.spans():
                  start = span.startchar or 0
                  end = span.endchar or 0
                  start_offseted = max(0, start - self.fragment_size)
                  end_offseted = end + self.fragment_size

                  if start_offseted < previous_end:
                      start_offseted = previous_end
                  previous_end = end_offseted
                  yield (start_offseted, end_offseted,)

          def highlight(self, content, top=5):
              """
              Return ``content`` escaped and highlighted for
              ``highlight_items``; an empty string unless ``search_type`` is
              ``'content'``.

              :param content: text to highlight
              :param top: passed to the highlighter as its ``top`` argument
              """
              if self.search_type != 'content':
                  return ''
              hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
              return hl

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# -*- coding: utf-8 -*-
				"""
				rhodecode.lib.indexers.__init__
				~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

				Whoosh indexing module for RhodeCode

				:created_on: Aug 17, 2010
				:author: marcink
				:copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
				:license: GPLv3, see COPYING for more details.
				"""
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU General Public License as published by
				# the Free Software Foundation, either version 3 of the License, or
				# (at your option) any later version.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				import os
				import sys
				import traceback
				from os.path import dirname as dn, join as jn

				#to get the rhodecode import
				sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

				from string import strip

				from rhodecode.model import init_model
				from rhodecode.model.scm import ScmModel
				from rhodecode.config.environment import load_environment
				from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

				from shutil import rmtree
				from webhelpers.html.builder import escape
				from vcs.utils.lazy import LazyProperty

				from sqlalchemy import engine_from_config

				from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
				from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
				from whoosh.index import create_in, open_dir
				from whoosh.formats import Characters
				from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter


				# File extensions whose content gets indexed.
				INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
				                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
				                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
				                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
				                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
				                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
				                    'yaws']

				# Custom analyzer: split the text into ``\w+`` tokens, then lowercase them.
				# The two stages are chained with a plain ``|`` operator.
				ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()


				# Index schema definition; ``content`` is analyzed with ANALYZER above.
				SCHEMA = Schema(owner=TEXT(),
				                repository=TEXT(stored=True),
				                path=TEXT(stored=True),
				                content=FieldType(format=Characters(ANALYZER),
				                                  scorable=True, stored=True),
				                modtime=STORED(), extension=TEXT(stored=True))


				# Name under which the index is kept.
				IDX_NAME = 'HG_INDEX'
				# Highlight output: 'span' tags, with a "break" span placed between fragments.
				FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
				# Fragmenter used for highlighting, configured with a size of 200.
				FRAGMENTER = SimpleFragmenter(200)


				class MakeIndex(BasePasterCommand):
				    """
				    Paster command that runs the Whoosh indexing daemon.

				    Takes exactly one positional argument, the configuration file, plus
				    the options registered in :meth:`update_parser`.
				    """

				    max_args = 1
				    min_args = 1

				    usage = "CONFIG_FILE"
				    summary = "Creates index for full text search given configuration file"
				    group_name = "RhodeCode"
				    takes_config_file = -1
				    parser = Command.standard_parser(verbose=True)

				    def command(self):
				        """
				        Set up cache and database from the pylons config, then run
				        ``WhooshIndexingDaemon`` while holding a ``DaemonLock``.

				        The index location comes from ``config['index_dir']``; the
				        repositories location, optional repository list and the
				        full-index flag come from the command line options.

				        Exits the process with status 1 when ``LockHeld`` is raised.
				        """
				        from pylons import config
				        add_cache(config)
				        engine = engine_from_config(config, 'sqlalchemy.db1.')
				        init_model(engine)

				        index_location = config['index_dir']
				        repo_location = self.options.repo_location

				        # ``--index-only`` is a comma separated list; strip whitespace
				        # around every name. ``None`` means no restriction was given.
				        raw_repo_list = self.options.repo_list
				        if raw_repo_list:
				            repo_list = [name.strip() for name in raw_repo_list.split(',')]
				        else:
				            repo_list = None

				        #==================================================================
				        # WHOOSH DAEMON
				        #==================================================================
				        from rhodecode.lib.pidlock import LockHeld, DaemonLock
				        from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
				        try:
				            lock = DaemonLock()
				            try:
				                daemon = WhooshIndexingDaemon(index_location=index_location,
				                                              repo_location=repo_location,
				                                              repo_list=repo_list)
				                daemon.run(full_index=self.options.full_index)
				            finally:
				                # Release the lock even when indexing fails, so a crashed
				                # run does not leave the lock held.
				                lock.release()
				        except LockHeld:
				            sys.exit(1)

				    def update_parser(self):
				        """Register the command line options read by :meth:`command`."""
				        self.parser.add_option('--repo-location',
				                          action='store',
				                          dest='repo_location',
				                          help="Specifies repositories location to index REQUIRED",
				                          )
				        self.parser.add_option('--index-only',
				                          action='store',
				                          dest='repo_list',
				                          help="Specifies a comma separated list of repositores "
				                                "to build index on OPTIONAL",
				                          )
				        self.parser.add_option('-f',
				                          action='store_true',
				                          dest='full_index',
				                          help="Specifies that index should be made full i.e"
				                                " destroy old and build from scratch",
				                          default=False)

				class ResultWrapper(object):
				    """
				    Sliceable, iterable view over the documents hit by a matcher.

				    Every item handed out is the stored-fields dict of one document,
				    extended with ``content_short``, ``content_short_hl`` and ``f_path``
				    (see :meth:`get_full_content`).
				    """

				    def __init__(self, search_type, searcher, matcher, highlight_items):
				        """
				        :param search_type: kind of search; highlighting is only produced
				            when it equals ``'content'``
				        :param searcher: object whose ``stored_fields(docnum)`` returns
				            the stored fields of a document
				        :param matcher: matcher providing ``is_active()``, ``id()``,
				            ``next()`` and ``spans()``
				        :param highlight_items: terms handed to the highlighter
				        """
				        self.search_type = search_type
				        self.searcher = searcher
				        self.matcher = matcher
				        self.highlight_items = highlight_items
				        # Number of characters kept on each side of a match. Floor
				        # division keeps it an int even under true division, since it
				        # ends up in slice bounds.
				        self.fragment_size = 200 // 2

				    @LazyProperty
				    def doc_ids(self):
				        """
				        Walk the matcher to its end and return a list of
				        ``[docnum, chunks]`` entries, where ``chunks`` is the list of
				        ``(start, end)`` offsets produced by :meth:`get_chunks`.
				        """
				        docs_id = []
				        while self.matcher.is_active():
				            docnum = self.matcher.id()
				            # Chunks are read from the matcher's current position, so
				            # they must be collected before advancing it.
				            docs_id.append([docnum, list(self.get_chunks())])
				            self.matcher.next()
				        return docs_id

				    def __str__(self):
				        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

				    def __repr__(self):
				        return self.__str__()

				    def __len__(self):
				        return len(self.doc_ids)

				    def __iter__(self):
				        """
				        Iterate over the results, generating the content of each one
				        lazily.
				        """
				        for docid in self.doc_ids:
				            yield self.get_full_content(docid)

				    def __getitem__(self, key):
				        """
				        Index or slice the results.

				        :param key: a ``slice`` (returns a list of result dicts) or an
				            integer index (returns a single result dict)
				        :raises IndexError: for an integer index out of range
				        """
				        if isinstance(key, slice):
				            return [self.get_full_content(docid)
				                    for docid in self.doc_ids[key]]
				        return self.get_full_content(self.doc_ids[key])

				    def get_full_content(self, docid):
				        """
				        Build the result dict for one ``[docnum, chunks]`` entry.

				        The stored fields of the document are extended with:

				        * ``content_short`` - the content cut down to the given chunks
				        * ``content_short_hl`` - highlighted version of the above
				        * ``f_path`` - the part of ``path`` that follows the repository
				          name, without leading slashes

				        :param docid: ``[docnum, chunks]`` entry taken from ``doc_ids``
				        """
				        docnum, chunks = docid[0], docid[1]
				        res = self.searcher.stored_fields(docnum)

				        path = res['path']
				        repository = res['repository']
				        repo_pos = path.find(repository)
				        if repo_pos == -1:
				            # Repository name is not part of the path: keep the whole
				            # path instead of slicing at a bogus offset.
				            f_path = path.lstrip('/')
				        else:
				            f_path = path[repo_pos + len(repository):].lstrip('/')

				        content_short = self.get_short_content(res, chunks)
				        res.update({'content_short': content_short,
				                    'content_short_hl': self.highlight(content_short),
				                    'f_path': f_path})

				        return res

				    def get_short_content(self, res, chunks):
				        """
				        Join the ``(start, end)`` slices of ``res['content']`` named by
				        ``chunks`` into one string.
				        """
				        content = res['content']
				        return ''.join([content[start:end] for start, end in chunks])

				    def get_chunks(self):
				        """
				        Yield one ``(start, end)`` character range per span of the
				        matcher's current document.

				        Each range is the span widened by ``fragment_size`` on both
				        sides. A range never starts before the end of the previously
				        yielded one, so close occurrences are not covered twice.
				        """
				        previous_end = 0
				        for span in self.matcher.spans():
				            start = span.startchar or 0
				            end = span.endchar or 0
				            start_offseted = max(0, start - self.fragment_size)
				            end_offseted = end + self.fragment_size

				            if start_offseted < previous_end:
				                start_offseted = previous_end
				            previous_end = end_offseted
				            yield (start_offseted, end_offseted,)

				    def highlight(self, content, top=5):
				        """
				        Return ``content`` escaped and highlighted for
				        ``highlight_items``; an empty string unless ``search_type`` is
				        ``'content'``.

				        :param content: text to highlight
				        :param top: passed to the highlighter as its ``top`` argument
				        """
				        if self.search_type != 'content':
				            return ''
				        hl = highlight(escape(content),
				                 self.highlight_items,
				                 analyzer=ANALYZER,
				                 fragmenter=FRAGMENTER,
				                 formatter=FORMATTER,
				                 top=top)
				        return hl