rhodecode-enterprise-ce Files · rhodecode/lib/index/search_utils.py

pull-requests: add merge check that detects WIP marker in title. This will prevent merges in such case....

pull-requests: add a merge check that detects a WIP marker in the title and prevents the merge in that case. A WIP marker in the title usually means the task is unfinished and still needs some work. This pattern is present in GitLab/GitHub and is already quite common.

dan - - Load All Authors

File last commit:

r3442:3bc8f801 default


                r4099:c12e69d0

default

Download file

             search_utils.py
        
                    197 lines
            
             | 6.0 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / index / search_utils.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # -*- coding: utf-8 -*-

      # Copyright (C) 2012-2019 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import re

      import pygments.filter

      import pygments.filters

      from pygments.token import Comment

      # Marker strings wrapped around a search hit; ElasticSearchHLFilter drops
      # the markers themselves and highlights the text found between them.
      HL_BEG_MARKER = '__RCSearchHLMarkBEG__'

      HL_END_MARKER = '__RCSearchHLMarkEND__'

      # Non-greedy regex for one "begin marker ... end marker" span; used as the
      # default pattern by get_matching_markers_offsets().
      HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)

      class ElasticSearchHLFilter(pygments.filters.Filter):
          """
          Pygments filter that re-tags text enclosed by the highlight markers.

          Text found between the begin and end marker is emitted with the
          ``Comment.ElasticMatch`` token type; the markers themselves are
          removed from the stream. Everything else keeps its original type.
          """

          _names = [HL_BEG_MARKER, HL_END_MARKER]

          def __init__(self, **options):
              pygments.filters.Filter.__init__(self, **options)

          def filter(self, lexer, stream):
              """
              Yield ``(token_type, text)`` pairs with markers stripped.

              :param lexer: not used by this filter
              :param stream: iterable of ``(token_type, text)`` pairs
              """
              begin_marker = self._names[0]
              end_marker = self._names[1]
              splitter = '({}|{})'.format(begin_marker, end_marker)

              # the flag lives outside the loop, so a highlight opened in one
              # token stays active for the following tokens until it is closed
              highlighting = False

              for ttype, value in stream:
                  if begin_marker not in value and end_marker not in value:
                      # marker-free values are passed through whole
                      if highlighting:
                          yield Comment.ElasticMatch, value
                      else:
                          yield ttype, value
                      continue

                  for piece in re.split(splitter, value):
                      if not piece:
                          # re.split produces empty strings around markers
                          continue
                      if piece == begin_marker:
                          highlighting = True
                      elif piece == end_marker:
                          highlighting = False
                      elif highlighting:
                          yield Comment.ElasticMatch, piece
                      else:
                          yield ttype, piece

      def extract_phrases(text_query):
          """
          Split a search query into phrases.

          Unquoted text is split on single spaces, while anything enclosed in
          double quotes is kept together as one phrase. A quote that is never
          closed keeps the rest of the query as one phrase. Every phrase is
          stripped, and phrases that end up empty are dropped, eg.

          'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']
          """
          collected = []
          current = []
          quoted = False

          for char in text_query:
              if char == '"':
                  # a quote both opens and closes a phrase; either way the
                  # text gathered so far is finished
                  collected.append(''.join(current))
                  current = []
                  quoted = not quoted
              elif char == ' ' and not quoted:
                  collected.append(''.join(current))
                  current = []
              else:
                  current.append(char)

          # whatever is left after the last separator
          collected.append(''.join(current))

          stripped = (item.strip() for item in collected)
          return [item for item in stripped if item]

      def get_matching_phrase_offsets(text, phrases):
          """
          Return the offsets in `text` at which any of `phrases` occurs.

          Phrases are matched literally: they are escaped before being handed
          to the regex engine, so characters such as ``.`` or ``(`` match only
          themselves and cannot raise ``re.error``.

          :param text: string to search in
          :param phrases: list of literal strings to look for; None or an
              empty list gives no matches. Empty phrases are skipped.
          :return: list of ``(start, end)`` tuples, grouped by phrase in the
              order given and by position within each phrase

          >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
          [(0, 4), (10, 14)]
          """
          offsets = []

          for phrase in phrases or []:
              if not phrase:
                  # an empty pattern matches zero-width at every position,
                  # which is never a meaningful hit
                  continue
              for match in re.finditer(re.escape(phrase), text):
                  offsets.append((match.start(), match.end()))

          return offsets

      def get_matching_markers_offsets(text, markers=None):
          r"""
          Return the offsets of every span of `text` matched by a marker regex.

          Each span covers the whole regex match, so it includes the begin and
          end markers themselves and not only the text between them.

          :param text: string to scan
          :param markers: list of regex patterns, each matching one marked
              span; ``[HL_MARKER_RE]`` is used when this is None or empty
          :return: list of ``(start, end)`` tuples, grouped by marker in the
              order given and by position within each marker

          >>> get_matching_markers_offsets(
          ...     '$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
          [(0, 8), (14, 22)]
          """
          # None/empty falls back to the default, so the list is never empty
          markers = markers or [HL_MARKER_RE]

          return [
              (match.start(), match.end())
              for mark in markers
              for match in re.finditer(mark, text)
          ]

      def normalize_text_for_matching(x):
          """
          Lower-case `x` and turn every non-word character into a space.

          Word characters are whatever the regex class ``\\w`` accepts, so
          underscores are preserved. Each replaced character becomes exactly
          one space, which keeps the string length unchanged. Useful for
          comparing two strings while ignoring punctuation.
          """
          lowered = x.lower()
          return re.sub(r'[^\w]', ' ', lowered)

      def get_matching_line_offsets(lines, terms=None, markers=None):
          """
          Find the lines of a text that match a search query or hold markers.

          :param lines: a single string; it is split into lines with
              ``splitlines()`` and the lines are numbered starting from 1
          :param terms: search term string to match in lines eg. 'some text'.
              When given, both the terms and each line are normalized with
              ``normalize_text_for_matching`` before matching, and `markers`
              is ignored.
          :param markers: used only when `terms` is empty: list of regexes
              marking the beginning and end of a matched item,
              eg. ['START(.*?)END']. Defaults to ``[HL_MARKER_RE]``.
          :return: tuple of (number of the last line, dict mapping the number
              of each matching line to its list of ``(start, end)`` offsets)

          eg.

          get_matching_line_offsets('words\\nsome text some\\ntext here', 'text')
          (3, {2: [(5, 9)], 3: [(0, 4)]})
          """
          if terms:
              phrases = [normalize_text_for_matching(phrase)
                         for phrase in extract_phrases(terms)]

              def find_offsets(line):
                  return get_matching_phrase_offsets(
                      normalize_text_for_matching(line), phrases)
          else:
              markers = markers or [HL_MARKER_RE]

              def find_offsets(line):
                  return get_matching_markers_offsets(line, markers=markers)

          matching_lines = {}
          # stays 0 when `lines` contains no lines at all
          line_index = 0
          for line_index, line in enumerate(lines.splitlines(), start=1):
              found = find_offsets(line)
              if found:
                  matching_lines[line_index] = found

          return line_index, matching_lines

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# -- coding: utf-8 --

				# Copyright (C) 2012-2019 RhodeCode GmbH
				#
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU Affero General Public License, version 3
				# (only), as published by the Free Software Foundation.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU Affero General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				#
				# This program is dual-licensed. If you wish to learn more about the
				# RhodeCode Enterprise Edition, including its added features, Support services,
				# and proprietary license terms, please see https://rhodecode.com/licenses/
				import re

				import pygments.filter
				import pygments.filters
				from pygments.token import Comment

				# Marker strings wrapped around a search hit; ElasticSearchHLFilter drops
				# the markers themselves and highlights the text found between them.
				HL_BEG_MARKER = '__RCSearchHLMarkBEG__'
				HL_END_MARKER = '__RCSearchHLMarkEND__'
				# Non-greedy regex for one "begin marker ... end marker" span; used as the
				# default pattern by get_matching_markers_offsets().
				HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)


				class ElasticSearchHLFilter(pygments.filters.Filter):
				    """
				    Pygments filter that re-tags text enclosed by the highlight markers.

				    Text found between the begin and end marker is emitted with the
				    ``Comment.ElasticMatch`` token type; the markers themselves are
				    removed from the stream. Everything else keeps its original type.
				    """
				    _names = [HL_BEG_MARKER, HL_END_MARKER]

				    def __init__(self, **options):
				        pygments.filters.Filter.__init__(self, **options)

				    def filter(self, lexer, stream):
				        """
				        Yield ``(token_type, text)`` pairs with markers stripped.

				        :param lexer: not used by this filter
				        :param stream: iterable of ``(token_type, text)`` pairs
				        """
				        def tokenize(_value):
				            # The `|` must be a bare regex alternation: written as `\|`
				            # it only matches a literal pipe, so markers would never be
				            # split out. The marker texts themselves are escaped so
				            # they are always matched literally.
				            for token in re.split('({}|{})'.format(
				                    re.escape(self._names[0]),
				                    re.escape(self._names[1])), _value):
				                if token:
				                    yield token

				        # kept outside the loop: a highlight opened in one token stays
				        # active for the following tokens until the end marker is seen
				        hl = False
				        for ttype, value in stream:

				            if self._names[0] in value or self._names[1] in value:
				                for item in tokenize(value):
				                    if item == self._names[0]:
				                        # skip marker, but start HL
				                        hl = True
				                        continue
				                    elif item == self._names[1]:
				                        # skip marker, and stop HL
				                        hl = False
				                        continue

				                    if hl:
				                        yield Comment.ElasticMatch, item
				                    else:
				                        yield ttype, item
				            else:
				                if hl:
				                    yield Comment.ElasticMatch, value
				                else:
				                    yield ttype, value


				def extract_phrases(text_query):
				    """
				    Split a search query into phrases.

				    Unquoted text is split on single spaces, while anything enclosed in
				    double quotes is kept together as one phrase. A quote that is never
				    closed keeps the rest of the query as one phrase. Every phrase is
				    stripped, and phrases that end up empty are dropped, eg.

				    'some text "a phrase" more' => ['some', 'text', 'a phrase', 'more']
				    """
				    candidates = []
				    # splitting on the quote character alternates between unquoted
				    # segments (even positions) and quoted segments (odd positions)
				    for position, segment in enumerate(text_query.split('"')):
				        if position % 2:
				            candidates.append(segment)
				        else:
				            candidates.extend(segment.split(' '))

				    return [item.strip() for item in candidates if item.strip()]


				def get_matching_phrase_offsets(text, phrases):
				    """
				    Return the offsets in `text` at which any of `phrases` occurs.

				    Phrases are matched literally: they are escaped before being handed
				    to the regex engine, so characters such as ``.`` or ``(`` match only
				    themselves and cannot raise ``re.error``.

				    :param text: string to search in
				    :param phrases: list of literal strings to look for; None or an
				        empty list gives no matches. Empty phrases are skipped.
				    :return: list of ``(start, end)`` tuples, grouped by phrase in the
				        order given and by position within each phrase

				    >>> get_matching_phrase_offsets('some text here', ['some', 'here'])
				    [(0, 4), (10, 14)]
				    """
				    # an empty pattern would match zero-width at every position, so
				    # empty phrases are left out before searching
				    literal_patterns = [
				        re.escape(phrase) for phrase in (phrases or []) if phrase]

				    return [
				        (match.start(), match.end())
				        for pattern in literal_patterns
				        for match in re.finditer(pattern, text)
				    ]


				def get_matching_markers_offsets(text, markers=None):
				    r"""
				    Return the offsets of every span of `text` matched by a marker regex.

				    Each span covers the whole regex match, so it includes the begin and
				    end markers themselves and not only the text between them.

				    :param text: string to scan
				    :param markers: list of regex patterns, each matching one marked
				        span; ``[HL_MARKER_RE]`` is used when this is None or empty
				    :return: list of ``(start, end)`` tuples, grouped by marker in the
				        order given and by position within each marker

				    >>> get_matching_markers_offsets(
				    ...     '$1some$2 text $1here$2 marked', [r'\$1(.*?)\$2'])
				    [(0, 8), (14, 22)]
				    """
				    offsets = []

				    # None/empty falls back to the default, so there is always at
				    # least one pattern to try
				    for mark in markers or [HL_MARKER_RE]:
				        offsets.extend(
				            (match.start(), match.end())
				            for match in re.finditer(mark, text))

				    return offsets


				def normalize_text_for_matching(x):
				    """
				    Lower-case `x` and turn every non-word character into a space.

				    Word characters are whatever the regex class ``\\w`` accepts, so
				    underscores are preserved. Each replaced character becomes exactly
				    one space, which keeps the string length unchanged. Useful for
				    comparing two strings while ignoring punctuation.
				    """
				    non_word = re.compile(r'[^\w]')
				    return non_word.sub(' ', x.lower())


				def get_matching_line_offsets(lines, terms=None, markers=None):
				    """
				    Find the lines of a text that match a search query or hold markers.

				    :param lines: a single string; it is split into lines with
				        ``splitlines()`` and the lines are numbered starting from 1
				    :param terms: search term string to match in lines eg. 'some text'.
				        When given, both the terms and each line are normalized with
				        ``normalize_text_for_matching`` before matching, and `markers`
				        is ignored.
				    :param markers: used only when `terms` is empty: list of regexes
				        marking the beginning and end of a matched item,
				        eg. ['START(.*?)END']. Defaults to ``[HL_MARKER_RE]``.
				    :return: tuple of (number of the last line, dict mapping the number
				        of each matching line to its list of ``(start, end)`` offsets)

				    eg.

				    get_matching_line_offsets('words\\nsome text some\\ntext here', 'text')
				    (3, {2: [(5, 9)], 3: [(0, 4)]})
				    """
				    if terms:
				        phrases = [normalize_text_for_matching(phrase)
				                   for phrase in extract_phrases(terms)]
				        offsets_per_line = [
				            get_matching_phrase_offsets(
				                normalize_text_for_matching(line), phrases)
				            for line in lines.splitlines()
				        ]
				    else:
				        markers = markers or [HL_MARKER_RE]
				        offsets_per_line = [
				            get_matching_markers_offsets(line, markers=markers)
				            for line in lines.splitlines()
				        ]

				    # keep only lines that produced at least one match
				    matching_lines = {}
				    for number, found in enumerate(offsets_per_line, start=1):
				        if found:
				            matching_lines[number] = found

				    # one entry per line, so the length is the last line number (0 if none)
				    return len(offsets_per_line), matching_lines