upstream/kallithea Files · rhodecode/lib/indexers/__init__.py

fixes + docs update

marcink - - Load All Authors

File last commit:

r894:1fed3c91 beta


                r894:1fed3c91

beta

Download file

             __init__.py
        
                    203 lines
            
             | 7.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / indexers / __init__.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
      import os

      import sys

        marcink
    
complete rewrite of paster commands,...

              r785
            
      import traceback

        marcink
    
renamed project to rhodecode

              r547
            
      from os.path import dirname as dn, join as jn

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
      #to get the rhodecode import

      sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

        marcink
    
fixes #90 + docs update

              r894
            
      from string import strip

        marcink
    
complete rewrite of paster commands,...

              r785
            
      from rhodecode.model import init_model

      from rhodecode.model.scm import ScmModel

        marcink
    
renamed project to rhodecode

              r547
            
      from rhodecode.config.environment import load_environment

        marcink
    
complete rewrite of paster commands,...

              r785
            
      from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

        marcink
    
renamed project to rhodecode

              r547
            
      from shutil import rmtree

      from webhelpers.html.builder import escape

      from vcs.utils.lazy import LazyProperty

        marcink
    
complete rewrite of paster commands,...

              r785
            
      from sqlalchemy import engine_from_config

        marcink
    
renamed project to rhodecode

              r547
            
      from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

      from whoosh.fields import TEXT, ID, STORED, Schema, FieldType

      from whoosh.index import create_in, open_dir

      from whoosh.formats import Characters

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
      from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter

        marcink
    
renamed project to rhodecode

              r547
            
      # File extensions whose content gets indexed for full text search.
      INDEX_EXTENSIONS = [
          'action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
          'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
          'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
          'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
          'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
          'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
          'yaws',
      ]

      # Custom analyzer: split the text on word characters, then lowercase
      # every token.
      ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

      # Index schema definition. ``content`` uses the analyzer above with a
      # ``Characters`` format; ``modtime`` is stored only.
      SCHEMA = Schema(
          owner=TEXT(),
          repository=TEXT(stored=True),
          path=TEXT(stored=True),
          content=FieldType(format=Characters(ANALYZER),
                            scorable=True, stored=True),
          modtime=STORED(),
          extension=TEXT(stored=True),
      )

      # Name under which the index is kept.
      IDX_NAME = 'HG_INDEX'

      # Shared highlighting helpers: matches are wrapped in <span> tags and
      # separate fragments are joined by a "break" marker.
      FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
      FRAGMENTER = SimpleFragmenter(200)

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
complete rewrite of paster commands,...

              r785
            
      class MakeIndex(BasePasterCommand):
          """
          Paster command that creates the full text search index for the
          repositories described by a configuration file.
          """

          max_args = 1
          min_args = 1

          usage = "CONFIG_FILE"
          summary = "Creates index for full text search given configuration file"
          group_name = "RhodeCode"
          takes_config_file = -1
          parser = Command.standard_parser(verbose=True)

          def command(self):
              """
              Set up caching and the database model from the pylons config,
              then run the whoosh indexing daemon under a ``DaemonLock``.

              Exits the process with status 1 when ``LockHeld`` is raised.
              """
              from pylons import config
              add_cache(config)
              engine = engine_from_config(config, 'sqlalchemy.db1.')
              init_model(engine)

              index_location = config['index_dir']
              repo_location = self.options.repo_location

              # ``--index-only`` is optional and optparse leaves the value as
              # None when the flag is absent, so only split a real string.
              # NOTE(review): presumably the daemon treats ``None`` as "index
              # every repository" - confirm against WhooshIndexingDaemon.
              raw_repo_list = self.options.repo_list
              if raw_repo_list:
                  repo_list = [name.strip() for name in raw_repo_list.split(',')]
              else:
                  repo_list = None

              #======================================================================
              # WHOOSH DAEMON
              #======================================================================
              from rhodecode.lib.pidlock import LockHeld, DaemonLock
              from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
              try:
                  lock = DaemonLock()
                  try:
                      daemon = WhooshIndexingDaemon(index_location=index_location,
                                                    repo_location=repo_location,
                                                    repo_list=repo_list)
                      daemon.run(full_index=self.options.full_index)
                  finally:
                      # release the lock even when indexing fails, so that a
                      # crashed run does not block the next one
                      lock.release()
              except LockHeld:
                  sys.exit(1)

          def update_parser(self):
              """
              Register the command line options of this command:
              ``--repo-location``, ``--index-only`` and ``-f``.
              """
              self.parser.add_option('--repo-location',
                                action='store',
                                dest='repo_location',
                                help="Specifies repositories location to index REQUIRED",
                                )
              self.parser.add_option('--index-only',
                                action='store',
                                dest='repo_list',
                                help="Specifies a comma separated list of repositores "
                                      "to build index on OPTIONAL",
                                )
              self.parser.add_option('-f',
                                action='store_true',
                                dest='full_index',
                                help="Specifies that index should be made full i.e"
                                      " destroy old and build from scratch",
                                default=False)

        marcink
    
Hacking for git support,and new faster repo scan

              r631
            
        marcink
    
renamed project to rhodecode

              r547
            
      class ResultWrapper(object):
          """
          Iterable, sliceable wrapper around the documents of a whoosh matcher.

          Every produced result is the dict of stored fields of a matching
          document, extended with ``content_short``, ``content_short_hl``
          and ``f_path``.
          """

          def __init__(self, search_type, searcher, matcher, highlight_items):
              """
              :param search_type: kind of search; highlighting is only done
                  when it equals ``'content'``
              :param searcher: object providing ``stored_fields(docnum)``
              :param matcher: object providing ``is_active()``, ``id()``,
                  ``next()`` and ``spans()``
              :param highlight_items: terms passed on to whoosh ``highlight``
              """
              self.search_type = search_type
              self.searcher = searcher
              self.matcher = matcher
              self.highlight_items = highlight_items
              # floor division: the value is used in slice arithmetic and has
              # to stay an integer even when ``/`` means true division
              self.fragment_size = 200 // 2

          @LazyProperty
          def doc_ids(self):
              """
              List of ``[docnum, chunks]`` pairs, one per matching document.

              Building the list advances the matcher until it is no longer
              active.
              """
              docs_id = []
              while self.matcher.is_active():
                  docnum = self.matcher.id()
                  chunks = list(self.get_chunks())
                  docs_id.append([docnum, chunks])
                  self.matcher.next()
              return docs_id

          def __str__(self):
              return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

          def __repr__(self):
              return self.__str__()

          def __len__(self):
              return len(self.doc_ids)

          def __iter__(self):
              """
              Iterate over the results, generating the content of each
              document only when it is requested.
              """
              for docid in self.doc_ids:
                  yield self.get_full_content(docid)

          def __getitem__(self, key):
              """
              Return a single result for an integer ``key`` or a list of
              results for a slice object.
              """
              if isinstance(key, slice):
                  return [self.get_full_content(docid)
                          for docid in self.doc_ids[key]]
              return self.get_full_content(self.doc_ids[key])

          def __getslice__(self, i, j):
              """
              Slicing of resultWrapper; returns the results ``i`` to ``j``
              as a list.
              """
              return [self.get_full_content(docid)
                      for docid in self.doc_ids[i:j]]

          def get_full_content(self, docid):
              """
              Build the result dict for one entry of ``doc_ids``.

              :param docid: ``[docnum, chunks]`` pair
              :returns: stored fields of the document updated with
                  ``content_short``, ``content_short_hl`` and ``f_path``
                  (the part of ``path`` after the repository name, without
                  leading slashes)
              """
              res = self.searcher.stored_fields(docid[0])
              path = res['path']
              repository = res['repository']

              position = path.find(repository)
              if position == -1:
                  # repository name is not part of the path; keep the whole
                  # path instead of cutting it at an arbitrary offset
                  f_path = path.lstrip('/')
              else:
                  f_path = path[position + len(repository):].lstrip('/')

              content_short = self.get_short_content(res, docid[1])
              res.update({'content_short':content_short,
                          'content_short_hl':self.highlight(content_short),
                          'f_path':f_path})

              return res

          def get_short_content(self, res, chunks):
              """
              Join the parts of ``res['content']`` selected by the
              ``(start, end)`` offsets in ``chunks``.
              """
              return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

          def get_chunks(self):
              """
              Smart function that implements chunking the content
              but not overlap chunks so it doesn't highlight the same
              close occurrences twice.

              Yields ``(start, end)`` character offsets, one per span of the
              matcher, each widened by ``fragment_size`` on both sides.
              """
              # end offset of the previously yielded chunk
              last_end = 0
              for span in self.matcher.spans():
                  start = span.startchar or 0
                  end = span.endchar or 0
                  start_offseted = max(0, start - self.fragment_size)
                  end_offseted = end + self.fragment_size

                  # never start before the previous chunk ended
                  if start_offseted < last_end:
                      start_offseted = last_end

                  last_end = end_offseted
                  yield (start_offseted, end_offseted,)

          def highlight(self, content, top=5):
              """
              Return ``content`` escaped and highlighted with whoosh, or an
              empty string when the search type is not ``'content'``.

              :param content: text to highlight
              :param top: passed on as ``top`` to whoosh ``highlight``
              """
              if self.search_type != 'content':
                  return ''
              return highlight(escape(content),
                               self.highlight_items,
                               analyzer=ANALYZER,
                               fragmenter=FRAGMENTER,
                               formatter=FORMATTER,
                               top=top)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

marcink Hacking for git support,and new faster repo scan	r631	import os
		import sys
marcink complete rewrite of paster commands,...	r785	import traceback
marcink renamed project to rhodecode	r547	from os.path import dirname as dn, join as jn
marcink Hacking for git support,and new faster repo scan	r631
		#to get the rhodecode import
		sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

marcink fixes #90 + docs update	r894	from string import strip

marcink complete rewrite of paster commands,...	r785	from rhodecode.model import init_model
		from rhodecode.model.scm import ScmModel
marcink renamed project to rhodecode	r547	from rhodecode.config.environment import load_environment
marcink complete rewrite of paster commands,...	r785	from rhodecode.lib.utils import BasePasterCommand, Command, add_cache

marcink renamed project to rhodecode	r547	from shutil import rmtree
		from webhelpers.html.builder import escape
		from vcs.utils.lazy import LazyProperty

marcink complete rewrite of paster commands,...	r785	from sqlalchemy import engine_from_config

marcink renamed project to rhodecode	r547	from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
		from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
		from whoosh.index import create_in, open_dir
		from whoosh.formats import Characters
marcink Hacking for git support,and new faster repo scan	r631	from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
marcink renamed project to rhodecode	r547

		#EXTENSIONS WE WANT TO INDEX CONTENT OFF
		INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
		'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
		'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
		'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
		'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
		'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
		'yaws']

		#CUSTOM ANALYZER wordsplit + lowercase filter
		ANALYZER = RegexTokenizer(expression=r"\w+") \| LowercaseFilter()


		#INDEX SCHEMA DEFINITION
		SCHEMA = Schema(owner=TEXT(),
		repository=TEXT(stored=True),
marcink Added searching for file names within the repository in rhodecode	r556	path=TEXT(stored=True),
marcink renamed project to rhodecode	r547	content=FieldType(format=Characters(ANALYZER),
		scorable=True, stored=True),
		modtime=STORED(), extension=TEXT(stored=True))


		IDX_NAME = 'HG_INDEX'
marcink Hacking for git support,and new faster repo scan	r631	FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
marcink renamed project to rhodecode	r547	FRAGMENTER = SimpleFragmenter(200)
marcink Hacking for git support,and new faster repo scan	r631

marcink complete rewrite of paster commands,...	r785	class MakeIndex(BasePasterCommand):
marcink Hacking for git support,and new faster repo scan	r631
		max_args = 1
		min_args = 1

		usage = "CONFIG_FILE"
		summary = "Creates index for full text search given configuration file"
marcink Implemented whoosh index building as paster command....	r683	group_name = "RhodeCode"
		takes_config_file = -1
marcink complete rewrite of paster commands,...	r785	parser = Command.standard_parser(verbose=True)

marcink Hacking for git support,and new faster repo scan	r631	def command(self):
marcink Implemented whoosh index building as paster command....	r683
marcink complete rewrite of paster commands,...	r785	from pylons import config
		add_cache(config)
		engine = engine_from_config(config, 'sqlalchemy.db1.')
		init_model(engine)

		index_location = config['index_dir']
marcink Implemented whoosh index building as paster command....	r683	repo_location = self.options.repo_location
marcink fixes #90 + docs update	r894	repo_list = map(strip, self.options.repo_list.split(','))
marcink Hacking for git support,and new faster repo scan	r631
marcink Implemented whoosh index building as paster command....	r683	#======================================================================
marcink Hacking for git support,and new faster repo scan	r631	# WHOOSH DAEMON
marcink Implemented whoosh index building as paster command....	r683	#======================================================================
marcink Hacking for git support,and new faster repo scan	r631	from rhodecode.lib.pidlock import LockHeld, DaemonLock
		from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon
		try:
		l = DaemonLock()
marcink Implemented whoosh index building as paster command....	r683	WhooshIndexingDaemon(index_location=index_location,
marcink fixes #90 + docs update	r894	repo_location=repo_location,
		repo_list=repo_list)\
marcink Hacking for git support,and new faster repo scan	r631	.run(full_index=self.options.full_index)
		l.release()
		except LockHeld:
		sys.exit(1)

marcink complete rewrite of paster commands,...	r785	def update_parser(self):
		self.parser.add_option('--repo-location',
		action='store',
		dest='repo_location',
		help="Specifies repositories location to index REQUIRED",
		)
marcink fixes #90 + docs update	r894	self.parser.add_option('--index-only',
		action='store',
		dest='repo_list',
		help="Specifies a comma separated list of repositores "
		"to build index on OPTIONAL",
		)
marcink complete rewrite of paster commands,...	r785	self.parser.add_option('-f',
		action='store_true',
		dest='full_index',
		help="Specifies that index should be made full i.e"
		" destroy old and build from scratch",
		default=False)
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	class ResultWrapper(object):
marcink Added searching for file names within the repository in rhodecode	r556	def __init__(self, search_type, searcher, matcher, highlight_items):
		self.search_type = search_type
marcink renamed project to rhodecode	r547	self.searcher = searcher
		self.matcher = matcher
		self.highlight_items = highlight_items
		self.fragment_size = 200 / 2
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	@LazyProperty
		def doc_ids(self):
		docs_id = []
		while self.matcher.is_active():
		docnum = self.matcher.id()
		chunks = [offsets for offsets in self.get_chunks()]
		docs_id.append([docnum, chunks])
		self.matcher.next()
marcink Hacking for git support,and new faster repo scan	r631	return docs_id

marcink renamed project to rhodecode	r547	def __str__(self):
		return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

		def __repr__(self):
		return self.__str__()

		def __len__(self):
		return len(self.doc_ids)

		def __iter__(self):
		"""
		Allows Iteration over results,and lazy generate content

		Requires implementation of ``__getitem__`` method.
		"""
		for docid in self.doc_ids:
		yield self.get_full_content(docid)

		def __getslice__(self, i, j):
		"""
		Slicing of resultWrapper
		"""
		slice = []
		for docid in self.doc_ids[i:j]:
		slice.append(self.get_full_content(docid))
marcink Hacking for git support,and new faster repo scan	r631	return slice

marcink renamed project to rhodecode	r547
		def get_full_content(self, docid):
		res = self.searcher.stored_fields(docid[0])
		f_path = res['path'][res['path'].find(res['repository']) \
		+ len(res['repository']):].lstrip('/')
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	content_short = self.get_short_content(res, docid[1])
		res.update({'content_short':content_short,
		'content_short_hl':self.highlight(content_short),
		'f_path':f_path})
marcink Hacking for git support,and new faster repo scan	r631
		return res

marcink renamed project to rhodecode	r547	def get_short_content(self, res, chunks):
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	def get_chunks(self):
		"""
		Smart function that implements chunking the content
		but not overlap chunks so it doesn't highlight the same
marcink Added searching for file names within the repository in rhodecode	r556	close occurrences twice.
marcink Hacking for git support,and new faster repo scan	r631	@param matcher:
		@param size:
marcink renamed project to rhodecode	r547	"""
		memory = [(0, 0)]
		for span in self.matcher.spans():
		start = span.startchar or 0
		end = span.endchar or 0
		start_offseted = max(0, start - self.fragment_size)
		end_offseted = end + self.fragment_size
marcink Hacking for git support,and new faster repo scan	r631
marcink renamed project to rhodecode	r547	if start_offseted < memory[-1][1]:
		start_offseted = memory[-1][1]
marcink Hacking for git support,and new faster repo scan	r631	memory.append((start_offseted, end_offseted,))
		yield (start_offseted, end_offseted,)

marcink renamed project to rhodecode	r547	def highlight(self, content, top=5):
marcink Added searching for file names within the repository in rhodecode	r556	if self.search_type != 'content':
		return ''
marcink renamed project to rhodecode	r547	hl = highlight(escape(content),
		self.highlight_items,
		analyzer=ANALYZER,
		fragmenter=FRAGMENTER,
		formatter=FORMATTER,
		top=top)
marcink Hacking for git support,and new faster repo scan	r631	return hl