rhodecode-enterprise-ce Files · rhodecode/lib/index/search_utils.py

libs: removed utf8 markers

super-admin - - Load All Authors

File last commit:

r5054:c54edc4f default


                r5054:c54edc4f

default

Download file

             search_utils.py
        
                    197 lines
            
             | 6.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / index / search_utils.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # Copyright (C) 2012-2020 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import re

      import pygments.filter

      import pygments.filters

      from pygments.token import Comment

      # Marker strings that delimit the start and the end of a highlighted span.
      HL_BEG_MARKER = '__RCSearchHLMarkBEG__'
      HL_END_MARKER = '__RCSearchHLMarkEND__'

      # Non-greedy regex matching one begin-marker ... end-marker span; the text
      # between the two markers is captured as group 1.
      HL_MARKER_RE = HL_BEG_MARKER + '(.*?)' + HL_END_MARKER

      class ElasticSearchHLFilter(pygments.filters.Filter):
          """
          Pygments filter that converts inline highlight markers into a token type.

          The begin/end marker strings listed in ``_names`` are removed from the
          token stream, and every piece of text found between a begin marker and
          the following end marker is emitted as ``Comment.ElasticMatch`` instead
          of its original token type.
          """

          # [begin marker, end marker]
          _names = [HL_BEG_MARKER, HL_END_MARKER]

          def __init__(self, **options):
              pygments.filters.Filter.__init__(self, **options)

          def filter(self, lexer, stream):
              """
              Yield ``(token_type, value)`` pairs with the markers stripped out.

              :param lexer: unused here; part of the pygments filter signature
              :param stream: iterable of ``(token_type, value)`` pairs
              """
              begin_marker = self._names[0]
              end_marker = self._names[1]
              split_pattern = '({}|{})'.format(begin_marker, end_marker)

              # The highlight state is kept across tokens, so a span opened in
              # one token stays active until an end marker shows up later.
              highlighting = False
              for ttype, value in stream:
                  if begin_marker not in value and end_marker not in value:
                      # No marker inside this token: pass it through whole.
                      yield (Comment.ElasticMatch if highlighting else ttype), value
                      continue

                  for piece in re.split(split_pattern, value):
                      if not piece:
                          # re.split produces empty strings around adjacent markers
                          continue
                      if piece == begin_marker:
                          # skip marker, but start HL
                          highlighting = True
                      elif piece == end_marker:
                          highlighting = False
                      else:
                          yield (Comment.ElasticMatch if highlighting else ttype), piece

      def extract_phrases(text_query):
          """
          Split a search term string into a list of phrases.

          Text enclosed in double quotes is kept together as one phrase; text
          outside quotes is split on single space characters. Every phrase is
          stripped of surrounding whitespace and empty results are discarded, eg.

          'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

          An unterminated quote keeps everything after it as a single phrase.
          """
          candidates = []
          # Splitting on the quote character alternates between text outside
          # quotes (even positions) and text inside quotes (odd positions).
          for position, segment in enumerate(text_query.split('"')):
              if position % 2:
                  candidates.append(segment)
              else:
                  candidates.extend(segment.split(' '))

          stripped = (candidate.strip() for candidate in candidates)
          return [phrase for phrase in stripped if phrase]

      def get_matching_phrase_offsets(text, phrases):
          """
          Returns a list of ``(start, end)`` string offsets in `text` at which the
          given `phrases` occur.

          Phrases are matched literally: regex metacharacters inside a phrase
          (eg. ``c++`` or ``(``) are escaped instead of being interpreted, so they
          can neither raise ``re.error`` nor match unintended text. Empty phrases
          are skipped because they would match at every position with zero width.

          :param text: string to search in
          :param phrases: iterable of literal phrases, or None for no phrases
          :return: offsets grouped per phrase, in the order the phrases are given

          >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
          [(0, 4), (10, 14)]

          """
          offsets = []

          for phrase in phrases or []:
              if not phrase:
                  continue
              for match in re.finditer(re.escape(phrase), text):
                  offsets.append((match.start(), match.end()))

          return offsets

      def get_matching_markers_offsets(text, markers=None):
          r"""
          Returns a list of ``(start, end)`` string offsets in `text` covered by
          matches of the given marker patterns.

          Each entry of `markers` is used as a regular expression. The offsets
          span the whole match, i.e. they include the marker delimiters
          themselves. When `markers` is None or empty, ``[HL_MARKER_RE]`` is used.

          :param text: string to search in
          :param markers: list of regex patterns, eg. ``['START(.*?)END']``
          :return: offsets grouped per marker, in the order the markers are given

          >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
          [(0, 8), (14, 22)]

          """
          patterns = markers or [HL_MARKER_RE]
          return [
              (match.start(), match.end())
              for pattern in patterns
              for match in re.finditer(pattern, text)
          ]

      def normalize_text_for_matching(x):
          """
          Lower-case `x` and turn every non-word character into a single space.

          Word characters (as defined by the regex class ``\\w``) are kept, so two
          strings can be compared without regard to case or punctuation.
          """
          lowered = x.lower()
          return re.sub(r'[^\w]', ' ', lowered)

      def get_matching_line_offsets(lines, terms=None, markers=None):
          """
          Find the lines of a text that match a search query or highlight markers.

          :param lines: a single string; it is split into lines with
              ``str.splitlines()``
          :param terms: search term string to match in lines eg. 'some text'.
              When given, both the phrases extracted from it and each line are
              normalized with `normalize_text_for_matching` before matching.
          :param markers: used only when `terms` is empty: list of regex patterns
              marking beginning and end of a matched item, eg. ['START(.*?)END'].
              Defaults to ``[HL_MARKER_RE]``.
          :return: tuple ``(number_of_lines, matches)`` where `matches` maps the
              1-based line number to its list of ``(start, end)`` offsets

          eg.

          text = 'words words words\\nsome text some\\ntext here what'
          get_matching_line_offsets(text, 'text')
          (3, {2: [(5, 9)], 3: [(0, 4)]})

          """
          if terms:
              needles = [normalize_text_for_matching(phrase)
                         for phrase in extract_phrases(terms)]

              def find_offsets(raw_line):
                  return get_matching_phrase_offsets(
                      normalize_text_for_matching(raw_line), needles)
          else:
              active_markers = markers or [HL_MARKER_RE]

              def find_offsets(raw_line):
                  return get_matching_markers_offsets(
                      raw_line, markers=active_markers)

          # Stays 0 when `lines` is empty; otherwise ends as the last line number.
          last_line_no = 0
          hits = {}
          for last_line_no, raw_line in enumerate(lines.splitlines(), start=1):
              found = find_offsets(raw_line)
              if found:
                  hits[last_line_no] = found

          return last_line_no, hits

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages



				# Copyright (C) 2012-2020 RhodeCode GmbH
				#
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU Affero General Public License, version 3
				# (only), as published by the Free Software Foundation.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU Affero General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				#
				# This program is dual-licensed. If you wish to learn more about the
				# RhodeCode Enterprise Edition, including its added features, Support services,
				# and proprietary license terms, please see https://rhodecode.com/licenses/
				import re

				import pygments.filter
				import pygments.filters
				from pygments.token import Comment

				# Marker strings that delimit the start and the end of a highlighted span.
				HL_BEG_MARKER = '__RCSearchHLMarkBEG__'
				HL_END_MARKER = '__RCSearchHLMarkEND__'
				# Non-greedy regex for one marked span; group 1 is the text in between.
				HL_MARKER_RE = ''.join((HL_BEG_MARKER, '(.*?)', HL_END_MARKER))


				class ElasticSearchHLFilter(pygments.filters.Filter):
				    """
				    Pygments filter that converts inline highlight markers into a token type.

				    The begin/end marker strings listed in ``_names`` are removed from the
				    token stream, and text found between a begin marker and the following
				    end marker is emitted as ``Comment.ElasticMatch`` instead of its
				    original token type.
				    """

				    # [begin marker, end marker]
				    _names = [HL_BEG_MARKER, HL_END_MARKER]

				    def __init__(self, **options):
				        pygments.filters.Filter.__init__(self, **options)

				    def filter(self, lexer, stream):
				        """
				        Yield ``(token_type, value)`` pairs with the markers stripped out.

				        :param lexer: unused here; part of the pygments filter signature
				        :param stream: iterable of ``(token_type, value)`` pairs
				        """
				        begin_marker = self._names[0]
				        end_marker = self._names[1]
				        # Plain alternation of the two markers. The pipe must NOT be
				        # escaped, otherwise the pattern only matches a literal '|' and
				        # the markers are never split out of the value.
				        splitter = re.compile('({}|{})'.format(
				            re.escape(begin_marker), re.escape(end_marker)))

				        # The highlight state is kept across tokens, so a span opened in
				        # one token stays active until an end marker shows up later.
				        hl = False
				        for ttype, value in stream:
				            if begin_marker in value or end_marker in value:
				                for item in splitter.split(value):
				                    if not item:
				                        # empty strings appear around adjacent markers
				                        continue
				                    if item == begin_marker:
				                        # skip marker, but start HL
				                        hl = True
				                        continue
				                    elif item == end_marker:
				                        hl = False
				                        continue

				                    if hl:
				                        yield Comment.ElasticMatch, item
				                    else:
				                        yield ttype, item
				            else:
				                if hl:
				                    yield Comment.ElasticMatch, value
				                else:
				                    yield ttype, value


				def extract_phrases(text_query):
				    """
				    Extracts phrases from search term string making sure phrases
				    contained in double quotes are kept together - and discarding empty values
				    or fully whitespace values eg.

				    'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

				    Text outside quotes is split on single space characters only. An
				    unterminated quote keeps everything after it as one phrase.
				    """
				    phrases = []
				    # Splitting on the quote character alternates between text outside
				    # quotes (even index) and text inside quotes (odd index).
				    for index, chunk in enumerate(text_query.split('"')):
				        if index % 2 == 0:
				            phrases.extend(chunk.split(' '))
				        else:
				            phrases.append(chunk)

				    return [phrase.strip() for phrase in phrases if phrase.strip()]


				def get_matching_phrase_offsets(text, phrases):
				    """
				    Returns a list of ``(start, end)`` string offsets in `text` at which the
				    given `phrases` occur.

				    Phrases are matched literally: regex metacharacters inside a phrase
				    (eg. ``c++`` or ``(``) are escaped instead of being interpreted. Empty
				    phrases are skipped because they would match at every position with
				    zero width.

				    :param text: string to search in
				    :param phrases: iterable of literal phrases, or None for no phrases
				    :return: offsets grouped per phrase, in the order the phrases are given

				    >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
				    [(0, 4), (10, 14)]

				    """
				    return [
				        (match.start(), match.end())
				        for phrase in (phrases or [])
				        if phrase
				        for match in re.finditer(re.escape(phrase), text)
				    ]


				def get_matching_markers_offsets(text, markers=None):
				    r"""
				    Returns a list of ``(start, end)`` string offsets in `text` covered by
				    matches of the given marker patterns.

				    Each entry of `markers` is used as a regular expression. The offsets
				    span the whole match, i.e. they include the marker delimiters
				    themselves. When `markers` is None or empty, ``[HL_MARKER_RE]`` is used.

				    >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
				    [(0, 8), (14, 22)]

				    """
				    markers = markers or [HL_MARKER_RE]
				    offsets = []

				    for mark in markers:
				        for match in re.finditer(mark, text):
				            offsets.append((match.start(), match.end()))

				    return offsets


				def normalize_text_for_matching(x):
				    """
				    Replaces all non-alphanumeric characters (anything not matched by the
				    regex class ``\\w``) with spaces and lower-cases the string; useful for
				    comparing two text strings without punctuation.
				    """
				    return re.sub(r'[^\w]', ' ', x.lower())


				def get_matching_line_offsets(lines, terms=None, markers=None):
				    """
				    Return the number of lines in `lines` together with the offsets of
				    every match of a text search query (or of highlight markers) per line.

				    :param lines: a single string; it is split with ``str.splitlines()``
				    :param terms: search term string to match in lines eg. 'some text'.
				        When given, the extracted phrases and each line are normalized
				        with `normalize_text_for_matching` before matching.
				    :param markers: instead of terms, use highlight markers that
				        mark beginning and end for matched item. eg. ['START(.*?)END'].
				        Only used when `terms` is empty; defaults to ``[HL_MARKER_RE]``.
				    :return: tuple ``(number_of_lines, matches)`` where `matches` maps the
				        1-based line number to its list of ``(start, end)`` offsets

				    eg.

				    text = 'words words words\\nsome text some\\ntext here what'
				    get_matching_line_offsets(text, 'text')
				    (3, {2: [(5, 9)], 3: [(0, 4)]})

				    """
				    matching_lines = {}
				    # Stays 0 when `lines` is empty; otherwise ends as the last line number.
				    line_index = 0

				    if terms:
				        phrases = [normalize_text_for_matching(phrase)
				                   for phrase in extract_phrases(terms)]

				        for line_index, line in enumerate(lines.splitlines(), start=1):
				            normalized_line = normalize_text_for_matching(line)
				            match_offsets = get_matching_phrase_offsets(normalized_line, phrases)
				            if match_offsets:
				                matching_lines[line_index] = match_offsets

				    else:
				        markers = markers or [HL_MARKER_RE]
				        for line_index, line in enumerate(lines.splitlines(), start=1):
				            match_offsets = get_matching_markers_offsets(line, markers=markers)
				            if match_offsets:
				                matching_lines[line_index] = match_offsets

				    return line_index, matching_lines