rhodecode-enterprise-ce Files · rhodecode/lib/index/search_utils.py

validators/schemas: python3 fixes str vs unicode and few test breaking fixes

super-admin - - Load All Authors

File last commit:

r5054:c54edc4f default


                r5066:ccd88b7c

default

Download file

             search_utils.py
        
                    197 lines
            
             | 6.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / index / search_utils.py
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        super-admin
    
libs: removed utf8 markers

              r5054
            
        dan
    
search: add support for elastic search 6...

              r3319
            
        marcink
    
code: update copyrights to 2020

              r4306
            
      # Copyright (C) 2012-2020 RhodeCode GmbH

        dan
    
search: add support for elastic search 6...

              r3319
            
      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import re

      import pygments.filter

      import pygments.filters

      from pygments.token import Comment

      # Sentinel strings marking the beginning and the end of a highlighted
      # fragment (presumably inserted by the search backend - confirm with caller).
      HL_BEG_MARKER = '__RCSearchHLMarkBEG__'

      HL_END_MARKER = '__RCSearchHLMarkEND__'

      # Regex pattern matching one BEG...END pair; the non-greedy group captures
      # the text between the two markers.
      HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)

      class ElasticSearchHLFilter(pygments.filters.Filter):
          """
          Pygments filter translating highlight markers into a token type.

          Text located between HL_BEG_MARKER and HL_END_MARKER is re-emitted as
          `Comment.ElasticMatch`, the marker strings themselves are removed from
          the stream. The highlight state is kept between tokens, so a marked
          region may span more than one token.
          """
          _names = [HL_BEG_MARKER, HL_END_MARKER]

          def __init__(self, **options):
              super().__init__(**options)

          def filter(self, lexer, stream):
              """
              Yield `(ttype, value)` pairs from `stream` with markers stripped
              and marked text re-typed as `Comment.ElasticMatch`.
              """
              begin_marker = self._names[0]
              end_marker = self._names[1]
              # the capturing group makes re.split return the markers as items too
              split_pattern = '({}|{})'.format(begin_marker, end_marker)

              highlighting = False
              for ttype, value in stream:
                  if begin_marker not in value and end_marker not in value:
                      # no marker inside this token, only the current state matters
                      yield (Comment.ElasticMatch if highlighting else ttype), value
                      continue

                  for piece in re.split(split_pattern, value):
                      if not piece:
                          continue
                      if piece == begin_marker:
                          # skip marker, but start HL
                          highlighting = True
                      elif piece == end_marker:
                          # skip marker, and stop HL
                          highlighting = False
                      elif highlighting:
                          yield Comment.ElasticMatch, piece
                      else:
                          yield ttype, piece

      def extract_phrases(text_query):
          """
          Split a search term string into phrases.

          Text enclosed in double quotes is kept together as one phrase, anything
          else is split on single space characters. Empty or whitespace-only
          results are discarded, eg.

          'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

          An unterminated quote keeps everything up to the end of the string as
          one phrase.
          """
          collected = []
          current = []
          quoted = False

          def flush():
              # close the phrase gathered so far and start a new, empty one
              collected.append(''.join(current))
              del current[:]

          for char in text_query:
              if char == '"':
                  # a quote always ends the current buffer and toggles phrase mode
                  flush()
                  quoted = not quoted
              elif char == ' ' and not quoted:
                  flush()
              else:
                  current.append(char)
          flush()

          stripped = (phrase.strip() for phrase in collected)
          return [phrase for phrase in stripped if phrase]

      def get_matching_phrase_offsets(text, phrases):
          """
          Returns a list of `(start, end)` string offsets in `text` at which one
          of the `phrases` occurs. Offsets are grouped per phrase, in the order
          the phrases were given.

          Phrases are matched literally: they are escaped before being handed to
          the regex engine, so characters such as `+`, `(` or `.` neither raise
          `re.error` nor act as wildcards. Empty phrases are skipped, as they
          would only produce zero-width matches at every position.

          :param text: string to search in
          :param phrases: list of phrase strings, `None` is treated as empty list

          >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
          [(0, 4), (10, 14)]

          """
          offsets = []

          for phrase in phrases or []:
              if not phrase:
                  continue
              pattern = re.escape(phrase)
              offsets.extend(match.span() for match in re.finditer(pattern, text))

          return offsets

      def get_matching_markers_offsets(text, markers=None):
          r"""
          Returns a list of `(start, end)` string offsets in `text` covered by a
          match of one of the `markers` patterns. The reported span is the whole
          match, i.e. it includes the marker strings themselves.

          :param text: string to search in
          :param markers: list of regex patterns, `[HL_MARKER_RE]` is used when
              it is `None` or empty

          >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
          [(0, 8), (14, 22)]

          """
          offsets = []

          for mark in markers or [HL_MARKER_RE]:
              offsets.extend(match.span() for match in re.finditer(mark, text))

          return offsets

      def normalize_text_for_matching(x):
          """
          Lower-case `x` and replace every non-word character (anything not
          matched by `\\w`) with a single space. Useful for comparing two text
          strings while ignoring punctuation.
          """
          lowered = x.lower()
          non_word = re.compile(r'[^\w]')
          return non_word.sub(' ', lowered)

      def get_matching_line_offsets(lines, terms=None, markers=None):
          """
          Return the number of lines in `lines` together with a mapping of line
          numbers (starting from 1) to the match offsets found on that line.

          :param lines: a single string with the whole text, it is split into
              lines using `str.splitlines()`
          :param terms: search term string to match in lines eg. 'some text'.
              Both the terms and the lines are normalized (lower-cased, non-word
              characters replaced by spaces) before matching.
          :param markers: used only when `terms` is empty: list of regex patterns
              marking beginning and end of a matched item, eg. ['START(.*?)END'].
              Defaults to `[HL_MARKER_RE]`.
          :return: tuple `(line_count, {line_number: [(start, end), ...]})`,
              lines without a match are not present in the mapping

          eg. (text lines starting at column 0)

          text = '''
          words words words
          words words words
          some text some
          words words words
          words words words
          text here what
          '''
          get_matching_line_offsets(text, 'text')
          (7, {4: [(5, 9)], 7: [(0, 4)]})

          """
          if terms:
              phrases = [normalize_text_for_matching(phrase)
                         for phrase in extract_phrases(terms)]
              # a punctuation-only phrase normalizes to blanks and would match
              # any run of whitespace/punctuation, so it is dropped here
              phrases = [phrase for phrase in phrases if phrase.strip()]

              def find_offsets(line):
                  return get_matching_phrase_offsets(
                      normalize_text_for_matching(line), phrases)
          else:
              markers = markers or [HL_MARKER_RE]

              def find_offsets(line):
                  return get_matching_markers_offsets(line, markers=markers)

          matching_lines = {}
          # stays 0 for empty input, otherwise ends up as the last line number
          line_index = 0
          for line_index, line in enumerate(lines.splitlines(), start=1):
              match_offsets = find_offsets(line)
              if match_offsets:
                  matching_lines[line_index] = match_offsets

          return line_index, matching_lines

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

super-admin libs: removed utf8 markers	r5054
dan search: add support for elastic search 6...	r3319
marcink code: update copyrights to 2020	r4306	# Copyright (C) 2012-2020 RhodeCode GmbH
dan search: add support for elastic search 6...	r3319	#
		# This program is free software: you can redistribute it and/or modify
		# it under the terms of the GNU Affero General Public License, version 3
		# (only), as published by the Free Software Foundation.
		#
		# This program is distributed in the hope that it will be useful,
		# but WITHOUT ANY WARRANTY; without even the implied warranty of
		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		# GNU General Public License for more details.
		#
		# You should have received a copy of the GNU Affero General Public License
		# along with this program. If not, see <http://www.gnu.org/licenses/>.
		#
		# This program is dual-licensed. If you wish to learn more about the
		# RhodeCode Enterprise Edition, including its added features, Support services,
		# and proprietary license terms, please see https://rhodecode.com/licenses/
		import re

		import pygments.filter
		import pygments.filters
		from pygments.token import Comment

		# Sentinel strings marking the beginning and the end of a highlighted
		# fragment (presumably inserted by the search backend - confirm with caller).
		HL_BEG_MARKER = '__RCSearchHLMarkBEG__'
		HL_END_MARKER = '__RCSearchHLMarkEND__'
		# Regex pattern matching one BEG...END pair; the non-greedy group captures
		# the text between the two markers.
		HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)


		class ElasticSearchHLFilter(pygments.filters.Filter):
		    """
		    Pygments filter translating highlight markers into a token type.

		    Text located between HL_BEG_MARKER and HL_END_MARKER is re-emitted as
		    `Comment.ElasticMatch`, the marker strings themselves are removed from
		    the stream. The highlight state is kept between tokens, so a marked
		    region may span more than one token.
		    """
		    _names = [HL_BEG_MARKER, HL_END_MARKER]

		    def __init__(self, **options):
		        pygments.filters.Filter.__init__(self, **options)

		    def filter(self, lexer, stream):
		        """
		        Yield `(ttype, value)` pairs from `stream` with markers stripped
		        and marked text re-typed as `Comment.ElasticMatch`.
		        """
		        # Plain `|` alternation: an escaped pipe would match a literal '|'
		        # and the markers would never be split out of the token value.
		        # The capturing group makes re.split return the markers as items.
		        split_pattern = '({}|{})'.format(self._names[0], self._names[1])

		        def tokenize(_value):
		            for token in re.split(split_pattern, _value):
		                if token:
		                    yield token

		        hl = False
		        for ttype, value in stream:
		            if self._names[0] in value or self._names[1] in value:
		                for item in tokenize(value):
		                    if item == self._names[0]:
		                        # skip marker, but start HL
		                        hl = True
		                        continue
		                    elif item == self._names[1]:
		                        # skip marker, and stop HL
		                        hl = False
		                        continue

		                    if hl:
		                        yield Comment.ElasticMatch, item
		                    else:
		                        yield ttype, item
		            else:
		                if hl:
		                    yield Comment.ElasticMatch, value
		                else:
		                    yield ttype, value


		def extract_phrases(text_query):
		    """
		    Extracts phrases from search term string making sure phrases
		    contained in double quotes are kept together - and discarding empty
		    values or fully whitespace values eg.

		    'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

		    Outside of quotes only the space character separates phrases. An
		    unterminated quote keeps everything up to the end of the string as one
		    phrase.
		    """
		    in_phrase = False
		    buf = []
		    phrases = []

		    for char in text_query:
		        if char == '"':
		            # a quote always closes the current buffer and toggles phrase mode
		            phrases.append(''.join(buf))
		            buf = []
		            in_phrase = not in_phrase
		        elif char == ' ' and not in_phrase:
		            phrases.append(''.join(buf))
		            buf = []
		        else:
		            buf.append(char)

		    # whatever is left after the last separator is a phrase too
		    phrases.append(''.join(buf))
		    return [phrase.strip() for phrase in phrases if phrase.strip()]


		def get_matching_phrase_offsets(text, phrases):
		    """
		    Returns a list of `(start, end)` string offsets in `text` at which one
		    of the `phrases` occurs. Offsets are grouped per phrase, in the order
		    the phrases were given.

		    Phrases are matched literally: they are escaped before being handed to
		    the regex engine, so characters such as `+`, `(` or `.` neither raise
		    `re.error` nor act as wildcards. Empty phrases are skipped, as they
		    would only produce zero-width matches at every position.

		    :param text: string to search in
		    :param phrases: list of phrase strings, `None` is treated as empty list

		    >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
		    [(0, 4), (10, 14)]

		    """
		    phrases = phrases or []
		    offsets = []

		    for phrase in phrases:
		        if not phrase:
		            continue
		        for match in re.finditer(re.escape(phrase), text):
		            offsets.append((match.start(), match.end()))

		    return offsets


		def get_matching_markers_offsets(text, markers=None):
		    r"""
		    Returns a list of `(start, end)` string offsets in `text` covered by a
		    match of one of the `markers` patterns. The reported span is the whole
		    match, i.e. it includes the marker strings themselves.

		    :param text: string to search in
		    :param markers: list of regex patterns, `[HL_MARKER_RE]` is used when
		        it is `None` or empty

		    >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
		    [(0, 8), (14, 22)]

		    """
		    markers = markers or [HL_MARKER_RE]
		    offsets = []

		    for mark in markers:
		        for match in re.finditer(mark, text):
		            offsets.append((match.start(), match.end()))

		    return offsets


		def normalize_text_for_matching(x):
		    """
		    Replaces all non alphanumeric characters (anything not matched by
		    `\\w`) with spaces and lower cases the string, useful for comparing
		    two text strings without punctuation
		    """
		    return re.sub(r'[^\w]', ' ', x.lower())


		def get_matching_line_offsets(lines, terms=None, markers=None):
		    """
		    Return the number of lines in `lines` together with a mapping of line
		    numbers (starting from 1) to the match offsets found on that line.

		    :param lines: a single string with the whole text, it is split into
		        lines using `str.splitlines()`
		    :param terms: search term string to match in lines eg. 'some text'.
		        Both the terms and the lines are normalized (lower-cased, non-word
		        characters replaced by spaces) before matching.
		    :param markers: used only when `terms` is empty: list of regex patterns
		        marking beginning and end of a matched item, eg. ['START(.*?)END'].
		        Defaults to `[HL_MARKER_RE]`.
		    :return: tuple `(line_count, {line_number: [(start, end), ...]})`,
		        lines without a match are not present in the mapping

		    eg. (text lines starting at column 0)

		    text = '''
		    words words words
		    words words words
		    some text some
		    words words words
		    words words words
		    text here what
		    '''
		    get_matching_line_offsets(text, 'text')
		    (7, {4: [(5, 9)], 7: [(0, 4)]})

		    """
		    matching_lines = {}
		    # stays 0 for empty input, otherwise ends up as the last line number
		    line_index = 0

		    if terms:
		        phrases = [normalize_text_for_matching(phrase)
		                   for phrase in extract_phrases(terms)]
		        # a punctuation-only phrase normalizes to blanks and would match
		        # any run of whitespace/punctuation, so it is dropped here
		        phrases = [phrase for phrase in phrases if phrase.strip()]

		        for line_index, line in enumerate(lines.splitlines(), start=1):
		            normalized_line = normalize_text_for_matching(line)
		            match_offsets = get_matching_phrase_offsets(normalized_line, phrases)
		            if match_offsets:
		                matching_lines[line_index] = match_offsets

		    else:
		        markers = markers or [HL_MARKER_RE]
		        for line_index, line in enumerate(lines.splitlines(), start=1):
		            match_offsets = get_matching_markers_offsets(line, markers=markers)
		            if match_offsets:
		                matching_lines[line_index] = match_offsets

		    return line_index, matching_lines