rhodecode-enterprise-ce Files · rhodecode/lib/index/search_utils.py

search: add support for elastic search 6...

search: add support for elastic search 6 - elasticsearch new query lang and searcher - much more advanced query lang - added context aware search in repository groups, repository, commits, files view - optimized quick search bar speed when using full text search - added option to increase hits per file number from query URL using max_lines - search results can be now marked inside a text file using ?mark=HIGHLIGHT_TEXT added to the url

dan - - Load All Authors

File last commit:

r3319:b8fd1d7a default


                r3319:b8fd1d7a

default

Download file

             search_utils.py
        
                    257 lines
            
             | 8.3 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / index / search_utils.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # -*- coding: utf-8 -*-

      # Copyright (C) 2012-2018 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import re

      import pygments.filter

      import pygments.filters

      from pygments.token import Comment

      HL_BEG_MARKER = '__RCSearchHLMarkBEG__'

      HL_END_MARKER = '__RCSearchHLMarkEND__'

      HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)

      class ElasticSearchHLFilter(pygments.filters.Filter):

          _names = [HL_BEG_MARKER, HL_END_MARKER]

          def __init__(self, **options):

              pygments.filters.Filter.__init__(self, **options)

          def filter(self, lexer, stream):

              def tokenize(_value):

                  for token in re.split('({}|{})'.format(

                          self._names[0], self._names[1]), _value):

                      if token:

                          yield token

              hl = False

              for ttype, value in stream:

                  if self._names[0] in value or self._names[1] in value:

                      for item in tokenize(value):

                          if item == self._names[0]:

                              # skip marker, but start HL

                              hl = True

                              continue

                          elif item == self._names[1]:

                              hl = False

                              continue

                          if hl:

                              yield Comment.ElasticMatch, item

                          else:

                              yield ttype, item

                  else:

                      if hl:

                          yield Comment.ElasticMatch, value

                      else:

                          yield ttype, value

      def extract_phrases(text_query):

          """

          Extracts phrases from search term string making sure phrases

          contained in double quotes are kept together - and discarding empty values

          or fully whitespace values eg.

          'some   text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

          """

          in_phrase = False

          buf = ''

          phrases = []

          for char in text_query:

              if in_phrase:

                  if char == '"':  # end phrase

                      phrases.append(buf)

                      buf = ''

                      in_phrase = False

                      continue

                  else:

                      buf += char

                      continue

              else:

                  if char == '"':  # start phrase

                      in_phrase = True

                      phrases.append(buf)

                      buf = ''

                      continue

                  elif char == ' ':

                      phrases.append(buf)

                      buf = ''

                      continue

                  else:

                      buf += char

          phrases.append(buf)

          phrases = [phrase.strip() for phrase in phrases if phrase.strip()]

          return phrases

      def get_matching_phrase_offsets(text, phrases):

          """

          Returns a list of string offsets in `text` that the list of `terms` match

          >>> get_matching_phrase_offsets('some text here', ['some', 'here'])

          [(0, 4), (10, 14)]

          """

          phrases = phrases or []

          offsets = []

          for phrase in phrases:

              for match in re.finditer(phrase, text):

                  offsets.append((match.start(), match.end()))

          return offsets

      def get_matching_markers_offsets(text, markers=None):

          """

          Returns a list of string offsets in `text` that the are between matching markers

          >>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', ['\$1(.*?)\$2'])

          [(0, 5), (16, 22)]

          """

          markers = markers or [HL_MARKER_RE]

          offsets = []

          if markers:

              for mark in markers:

                  for match in re.finditer(mark, text):

                      offsets.append((match.start(), match.end()))

          return offsets

      def normalize_text_for_matching(x):

          """

          Replaces all non alfanum characters to spaces and lower cases the string,

          useful for comparing two text strings without punctuation

          """

          return re.sub(r'[^\w]', ' ', x.lower())

      def get_matching_line_offsets(lines, terms=None, markers=None):

          """ Return a set of `lines` indices (starting from 1) matching a

          text search query, along with `context` lines above/below matching lines

          :param lines: list of strings representing lines

          :param terms: search term string to match in lines eg. 'some text'

          :param markers: instead of terms, use highlight markers instead that

              mark beginning and end for matched item. eg. ['START(.*?)END']

           eg.

          text = '''

          words words words

          words words words

          some text some

          words words words

          words words words

          text here what

          '''

          get_matching_line_offsets(text, 'text', context=1)

          6, {3: [(5, 9)], 6: [(0, 4)]]

          """

          matching_lines = {}

          line_index = 0

          if terms:

              phrases = [normalize_text_for_matching(phrase)

                         for phrase in extract_phrases(terms)]

              for line_index, line in enumerate(lines.splitlines(), start=1):

                  normalized_line = normalize_text_for_matching(line)

                  match_offsets = get_matching_phrase_offsets(normalized_line, phrases)

                  if match_offsets:

                      matching_lines[line_index] = match_offsets

          else:

              markers = markers or [HL_MARKER_RE]

              for line_index, line in enumerate(lines.splitlines(), start=1):

                  match_offsets = get_matching_markers_offsets(line, markers=markers)

                  if match_offsets:

                      matching_lines[line_index] = match_offsets

          return line_index, matching_lines

      def lucene_query_parser():

          # from pyparsing lucene_grammar

          from pyparsing import (

              Literal, CaselessKeyword, Forward, Regex, QuotedString, Suppress,

              Optional, Group, infixNotation, opAssoc, ParserElement, pyparsing_common)

          ParserElement.enablePackrat()

          COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = map(Literal, ":[]{}~^")

          LPAR, RPAR = map(Suppress, "()")

          and_, or_, not_, to_ = map(CaselessKeyword, "AND OR NOT TO".split())

          keyword = and_ | or_ | not_ | to_

          expression = Forward()

          valid_word = Regex(r'([a-zA-Z0-9*_+.-]|\\[!(){}\[\]^"~*?\\:])+').setName("word")

          valid_word.setParseAction(

              lambda t: t[0]

                  .replace('\\\\', chr(127))

                  .replace('\\', '')

                  .replace(chr(127), '\\')

          )

          string = QuotedString('"')

          required_modifier = Literal("+")("required")

          prohibit_modifier = Literal("-")("prohibit")

          integer = Regex(r"\d+").setParseAction(lambda t: int(t[0]))

          proximity_modifier = Group(TILDE + integer("proximity"))

          number = pyparsing_common.fnumber()

          fuzzy_modifier = TILDE + Optional(number, default=0.5)("fuzzy")

          term = Forward()

          field_name = valid_word().setName("fieldname")

          incl_range_search = Group(LBRACK + term("lower") + to_ + term("upper") + RBRACK)

          excl_range_search = Group(LBRACE + term("lower") + to_ + term("upper") + RBRACE)

          range_search = incl_range_search("incl_range") | excl_range_search("excl_range")

          boost = (CARAT + number("boost"))

          string_expr = Group(string + proximity_modifier) | string

          word_expr = Group(valid_word + fuzzy_modifier) | valid_word

          term << (Optional(field_name("field") + COLON) +

                   (word_expr | string_expr | range_search | Group(

                       LPAR + expression + RPAR)) +

                   Optional(boost))

          term.setParseAction(lambda t: [t] if 'field' in t or 'boost' in t else None)

          expression << infixNotation(

              term,

              [

                  (required_modifier | prohibit_modifier, 1, opAssoc.RIGHT),

                  ((not_ | '!').setParseAction(lambda: "NOT"), 1, opAssoc.RIGHT),

                  ((and_ | '&&').setParseAction(lambda: "AND"), 2, opAssoc.LEFT),

                  (Optional(or_ | '||').setParseAction(lambda: "OR"), 2, opAssoc.LEFT),

              ]

          )

          return expression

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# -- coding: utf-8 --

				# Copyright (C) 2012-2018 RhodeCode GmbH
				#
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU Affero General Public License, version 3
				# (only), as published by the Free Software Foundation.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU Affero General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				#
				# This program is dual-licensed. If you wish to learn more about the
				# RhodeCode Enterprise Edition, including its added features, Support services,
				# and proprietary license terms, please see https://rhodecode.com/licenses/
				import re

				import pygments.filter
				import pygments.filters
				from pygments.token import Comment

				HL_BEG_MARKER = '__RCSearchHLMarkBEG__'
				HL_END_MARKER = '__RCSearchHLMarkEND__'
				HL_MARKER_RE = '{}(.*?){}'.format(HL_BEG_MARKER, HL_END_MARKER)


				class ElasticSearchHLFilter(pygments.filters.Filter):
				_names = [HL_BEG_MARKER, HL_END_MARKER]

				def __init__(self, **options):
				pygments.filters.Filter.__init__(self, **options)

				def filter(self, lexer, stream):
				def tokenize(_value):
				for token in re.split('({}\|{})'.format(
				self._names[0], self._names[1]), _value):
				if token:
				yield token

				hl = False
				for ttype, value in stream:

				if self._names[0] in value or self._names[1] in value:
				for item in tokenize(value):
				if item == self._names[0]:
				# skip marker, but start HL
				hl = True
				continue
				elif item == self._names[1]:
				hl = False
				continue

				if hl:
				yield Comment.ElasticMatch, item
				else:
				yield ttype, item
				else:
				if hl:
				yield Comment.ElasticMatch, value
				else:
				yield ttype, value


				def extract_phrases(text_query):
				"""
				Extracts phrases from search term string making sure phrases
				contained in double quotes are kept together - and discarding empty values
				or fully whitespace values eg.

				'some text "a phrase" more' => ['some', 'text', 'a phrase', 'more']

				"""

				in_phrase = False
				buf = ''
				phrases = []
				for char in text_query:
				if in_phrase:
				if char == '"': # end phrase
				phrases.append(buf)
				buf = ''
				in_phrase = False
				continue
				else:
				buf += char
				continue
				else:
				if char == '"': # start phrase
				in_phrase = True
				phrases.append(buf)
				buf = ''
				continue
				elif char == ' ':
				phrases.append(buf)
				buf = ''
				continue
				else:
				buf += char

				phrases.append(buf)
				phrases = [phrase.strip() for phrase in phrases if phrase.strip()]
				return phrases


				def get_matching_phrase_offsets(text, phrases):
				"""
				Returns a list of string offsets in `text` that the list of `terms` match

				>>> get_matching_phrase_offsets('some text here', ['some', 'here'])
				[(0, 4), (10, 14)]

				"""
				phrases = phrases or []
				offsets = []

				for phrase in phrases:
				for match in re.finditer(phrase, text):
				offsets.append((match.start(), match.end()))

				return offsets


				def get_matching_markers_offsets(text, markers=None):
				"""
				Returns a list of string offsets in `text` that the are between matching markers

				>>> get_matching_markers_offsets('$1some$2 text $1here$2 marked', ['\$1(.*?)\$2'])
				[(0, 5), (16, 22)]

				"""
				markers = markers or [HL_MARKER_RE]
				offsets = []

				if markers:
				for mark in markers:
				for match in re.finditer(mark, text):
				offsets.append((match.start(), match.end()))

				return offsets


				def normalize_text_for_matching(x):
				"""
				Replaces all non alfanum characters to spaces and lower cases the string,
				useful for comparing two text strings without punctuation
				"""
				return re.sub(r'[^\w]', ' ', x.lower())


				def get_matching_line_offsets(lines, terms=None, markers=None):
				""" Return a set of `lines` indices (starting from 1) matching a
				text search query, along with `context` lines above/below matching lines

				:param lines: list of strings representing lines
				:param terms: search term string to match in lines eg. 'some text'
				:param markers: instead of terms, use highlight markers instead that
				mark beginning and end for matched item. eg. ['START(.*?)END']

				eg.

				text = '''
				words words words
				words words words
				some text some
				words words words
				words words words
				text here what
				'''
				get_matching_line_offsets(text, 'text', context=1)
				6, {3: [(5, 9)], 6: [(0, 4)]]

				"""
				matching_lines = {}
				line_index = 0

				if terms:
				phrases = [normalize_text_for_matching(phrase)
				for phrase in extract_phrases(terms)]

				for line_index, line in enumerate(lines.splitlines(), start=1):
				normalized_line = normalize_text_for_matching(line)
				match_offsets = get_matching_phrase_offsets(normalized_line, phrases)
				if match_offsets:
				matching_lines[line_index] = match_offsets

				else:
				markers = markers or [HL_MARKER_RE]
				for line_index, line in enumerate(lines.splitlines(), start=1):
				match_offsets = get_matching_markers_offsets(line, markers=markers)
				if match_offsets:
				matching_lines[line_index] = match_offsets

				return line_index, matching_lines


				def lucene_query_parser():
				# from pyparsing lucene_grammar
				from pyparsing import (
				Literal, CaselessKeyword, Forward, Regex, QuotedString, Suppress,
				Optional, Group, infixNotation, opAssoc, ParserElement, pyparsing_common)

				ParserElement.enablePackrat()

				COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = map(Literal, ":[]{}~^")
				LPAR, RPAR = map(Suppress, "()")
				and_, or_, not_, to_ = map(CaselessKeyword, "AND OR NOT TO".split())
				keyword = and_ \| or_ \| not_ \| to_

				expression = Forward()

				valid_word = Regex(r'([a-zA-Z0-9_+.-]\|\\[!(){}\[\]^"~?\\:])+').setName("word")
				valid_word.setParseAction(
				lambda t: t[0]
				.replace('\\\\', chr(127))
				.replace('\\', '')
				.replace(chr(127), '\\')
				)

				string = QuotedString('"')

				required_modifier = Literal("+")("required")
				prohibit_modifier = Literal("-")("prohibit")
				integer = Regex(r"\d+").setParseAction(lambda t: int(t[0]))
				proximity_modifier = Group(TILDE + integer("proximity"))
				number = pyparsing_common.fnumber()
				fuzzy_modifier = TILDE + Optional(number, default=0.5)("fuzzy")

				term = Forward()
				field_name = valid_word().setName("fieldname")
				incl_range_search = Group(LBRACK + term("lower") + to_ + term("upper") + RBRACK)
				excl_range_search = Group(LBRACE + term("lower") + to_ + term("upper") + RBRACE)
				range_search = incl_range_search("incl_range") \| excl_range_search("excl_range")
				boost = (CARAT + number("boost"))

				string_expr = Group(string + proximity_modifier) \| string
				word_expr = Group(valid_word + fuzzy_modifier) \| valid_word
				term << (Optional(field_name("field") + COLON) +
				(word_expr \| string_expr \| range_search \| Group(
				LPAR + expression + RPAR)) +
				Optional(boost))
				term.setParseAction(lambda t: [t] if 'field' in t or 'boost' in t else None)

				expression << infixNotation(
				term,
				[
				(required_modifier \| prohibit_modifier, 1, opAssoc.RIGHT),
				((not_ \| '!').setParseAction(lambda: "NOT"), 1, opAssoc.RIGHT),
				((and_ \| '&&').setParseAction(lambda: "AND"), 2, opAssoc.LEFT),
				(Optional(or_ \| '\|\|').setParseAction(lambda: "OR"), 2, opAssoc.LEFT),
				]
				)

				return expression