__init__.py
139 lines | 4.8 KiB | text/x-python | PythonLexer

from os.path import dirname as dn, join as jn
from pylons_app.config.environment import load_environment
from pylons_app.model.hg_model import HgModel
from shutil import rmtree
from webhelpers.html.builder import escape
from vcs.utils.lazy import LazyProperty
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
from whoosh.index import create_in, open_dir
from whoosh.formats import Characters
from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
import os
import sys
import traceback

#make pylons_app importable when this module is run standalone
sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))

#LOCATION WHERE WE KEEP THE INDEX
IDX_LOCATION = jn(dn(dn(dn(dn(os.path.abspath(__file__))))), 'data', 'index')

#EXTENSIONS WE WANT TO INDEX CONTENT OF
INDEX_EXTENSIONS = ['action', 'adp', 'ashx', 'asmx', 'aspx', 'asx', 'axd', 'c',
                    'cfg', 'cfm', 'cpp', 'cs', 'css', 'diff', 'do', 'el', 'erl',
                    'h', 'htm', 'html', 'ini', 'java', 'js', 'jsp', 'jspx', 'lisp',
                    'lua', 'm', 'mako', 'ml', 'pas', 'patch', 'php', 'php3',
                    'php4', 'phtml', 'pm', 'py', 'rb', 'rst', 's', 'sh', 'sql',
                    'tpl', 'txt', 'vim', 'wss', 'xhtml', 'xml', 'xsl', 'xslt',
                    'yaws']
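
#A minimal sketch (an assumption, not part of this module) of how a file
#walker could use INDEX_EXTENSIONS to pick files worth indexing:
#
#   def is_indexable(path):
#       return path.rsplit('.', 1)[-1].lower() in INDEX_EXTENSIONS
#
#   is_indexable('setup.py')   # True,  'py' is listed
#   is_indexable('logo.png')   # False, 'png' is not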

#CUSTOM ANALYZER: wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
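
#Illustration of what this analyzer yields (whoosh analyzers are callable
#and produce Token objects with a .text attribute):
#
#   >>> [t.text for t in ANALYZER(u'Foo_bar BAZ 42')]
#   [u'foo_bar', u'baz', u'42']
#
#RegexTokenizer keeps each \w+ run (letters, digits, underscore) as a
#single token; LowercaseFilter then folds the case.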

#INDEX SCHEMA DEFINITION
SCHEMA = Schema(owner=TEXT(),
                repository=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=FieldType(format=Characters(ANALYZER),
                                  scorable=True, stored=True),
                modtime=STORED(), extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = SimpleFragmenter(200)
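
#A hedged sketch of building an index with the schema above (the daemon
#that actually populates the index lives elsewhere; the field values here
#are made up for illustration):
#
#   if not os.path.exists(IDX_LOCATION):
#       os.makedirs(IDX_LOCATION)
#   idx = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
#   writer = idx.writer()
#   writer.add_document(owner=u'admin', repository=u'pylons_app',
#                       path=u'/pylons_app/__init__.py',
#                       content=u'file content to index', modtime=0,
#                       extension=u'py')
#   writer.commit()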

class ResultWrapper(object):
    def __init__(self, searcher, matcher, highlight_items):
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        #half of the FRAGMENTER's 200-char fragment, used as padding
        #on each side of a match in get_chunks()
        self.fragment_size = 200 / 2

    @LazyProperty
    def doc_ids(self):
        docs_id = []
        while self.matcher.is_active():
            docnum = self.matcher.id()
            chunks = [offsets for offsets in self.get_chunks()]
            docs_id.append([docnum, chunks])
            self.matcher.next()
        return docs_id

    def __str__(self):
        return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.doc_ids)

    def __iter__(self):
        """
        Allows iteration over results, generating content lazily.

        *Requires* implementation of ``__getitem__`` method.
        """
        for docid in self.doc_ids:
            yield self.get_full_content(docid)

    def __getslice__(self, i, j):
        """
        Slicing of ResultWrapper
        """
        slice_ = []
        for docid in self.doc_ids[i:j]:
            slice_.append(self.get_full_content(docid))
        return slice_

    def get_full_content(self, docid):
        res = self.searcher.stored_fields(docid[0])
        #strip everything up to and including the repository name from
        #the stored path
        f_path = res['path'][res['path'].find(res['repository'])
                             + len(res['repository']):].lstrip('/')

        content_short = self.get_short_content(res, docid[1])
        res.update({'content_short': content_short,
                    'content_short_hl': self.highlight(content_short),
                    'f_path': f_path})

        return res

    def get_short_content(self, res, chunks):
        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])

    def get_chunks(self):
        """
        Smart function that implements chunking the content
        but does not overlap chunks, so it doesn't highlight the same
        close occurrences twice.
        """
        memory = [(0, 0)]
        for span in self.matcher.spans():
            start = span.startchar or 0
            end = span.endchar or 0
            start_offseted = max(0, start - self.fragment_size)
            end_offseted = end + self.fragment_size

            if start_offseted < memory[-1][1]:
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted,))
            yield (start_offseted, end_offseted,)
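
    #Worked example of the merge above (fragment_size == 100): match spans
    #at chars 120-125 and 150-155 yield chunks (20, 225) and (225, 255);
    #the second chunk starts where the first one ended instead of
    #re-covering the overlapping text.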

    def highlight(self, content, top=5):
        hl = highlight(escape(content),
                       self.highlight_items,
                       analyzer=ANALYZER,
                       fragmenter=FRAGMENTER,
                       formatter=FORMATTER,
                       top=top)
        return hl
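
#A hedged end-to-end sketch of driving ResultWrapper (the QueryParser setup
#and the query text are assumptions for illustration):
#
#   from whoosh.qparser import QueryParser
#
#   idx = open_dir(IDX_LOCATION, indexname=IDX_NAME)
#   searcher = idx.searcher()
#   query = QueryParser('content', schema=SCHEMA).parse(u'lazy')
#   matcher = query.matcher(searcher)
#   results = ResultWrapper(searcher, matcher,
#                           highlight_items=set([u'lazy']))
#   for res in results[0:10]:  #paging goes through __getslice__
#       print res['f_path'], res['content_short_hl']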