rhodecode-enterprise-ce Commit - r1356:1e4a47eb

diffs: optimize how lexer is fetche for rich highlight mode....

marcink -

r1356:1e4a47eb default

parent child

rhodecode/lib/codeblocks.py

0 +23 -9

              # -*- coding: utf-8 -*-
              # Copyright (C) 2011-2017 RhodeCode GmbH
              #
              # This program is free software: you can redistribute it and/or modify
              # it under the terms of the GNU Affero General Public License, version 3
              # (only), as published by the Free Software Foundation.
              #
              # This program is distributed in the hope that it will be useful,
              # but WITHOUT ANY WARRANTY; without even the implied warranty of
              # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
              # GNU General Public License for more details.
              #
              # You should have received a copy of the GNU Affero General Public License
              # along with this program.  If not, see <http://www.gnu.org/licenses/>.
              #
              # This program is dual-licensed. If you wish to learn more about the
              # RhodeCode Enterprise Edition, including its added features, Support services,
              # and proprietary license terms, please see https://rhodecode.com/licenses/
              import logging
              import difflib
              from itertools import groupby
              from pygments import lex
              from pygments.formatters.html import _get_ttype_class as pygment_token_class
              from rhodecode.lib.helpers import (
                  get_lexer_for_filenode, get_lexer_safe, html_escape)
              from rhodecode.lib.utils2 import AttributeDict
              from rhodecode.lib.vcs.nodes import FileNode
              from rhodecode.lib.diff_match_patch import diff_match_patch
              from rhodecode.lib.diffs import LimitedDiffContainer
              from pygments.lexers import get_lexer_by_name
              plain_text_lexer = get_lexer_by_name(
                  'text', stripall=False, stripnl=False, ensurenl=False)
              log = logging.getLogger()
              def filenode_as_lines_tokens(filenode, lexer=None):
+                 org_lexer = lexer
                  lexer = lexer or get_lexer_for_filenode(filenode)
-                 log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
+                 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
+                           lexer, filenode, org_lexer)
                  tokens = tokenize_string(filenode.content, lexer)
                  lines = split_token_stream(tokens, split_string='\n')
                  rv = list(lines)
                  return rv
              def tokenize_string(content, lexer):
                  """
                  Use pygments to tokenize some content based on a lexer
                  ensuring all original new lines and whitespace is preserved
                  """
                  lexer.stripall = False
                  lexer.stripnl = False
                  lexer.ensurenl = False
                  for token_type, token_text in lex(content, lexer):
                      yield pygment_token_class(token_type), token_text
              def split_token_stream(tokens, split_string=u'\n'):
                  """
                  Take a list of (TokenType, text) tuples and split them by a string
                  >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
                  [(TEXT, 'some'), (TEXT, 'text'),
                   (TEXT, 'more'), (TEXT, 'text')]
                  """
                  buffer = []
                  for token_class, token_text in tokens:
                      parts = token_text.split(split_string)
                      for part in parts[:-1]:
                          buffer.append((token_class, part))
                          yield buffer
                          buffer = []
                      buffer.append((token_class, parts[-1]))
                  if buffer:
                      yield buffer
              def filenode_as_annotated_lines_tokens(filenode):
                  """
                  Take a file node and return a list of annotations => lines, if no annotation
                  is found, it will be None.
                  eg:
                  [
                      (annotation1, [
                          (1, line1_tokens_list),
                          (2, line2_tokens_list),
                      ]),
                      (annotation2, [
                          (3, line1_tokens_list),
                      ]),
                      (None, [
                          (4, line1_tokens_list),
                      ]),
                      (annotation1, [
                          (5, line1_tokens_list),
                          (6, line2_tokens_list),
                      ])
                  ]
                  """
                  commit_cache = {} # cache commit_getter lookups
                  def _get_annotation(commit_id, commit_getter):
                      if commit_id not in commit_cache:
                          commit_cache[commit_id] = commit_getter()
                      return commit_cache[commit_id]
                  annotation_lookup = {
                      line_no: _get_annotation(commit_id, commit_getter)
                      for line_no, commit_id, commit_getter, line_content
                      in filenode.annotate
                  }
                  annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
                                        for line_no, tokens
                                        in enumerate(filenode_as_lines_tokens(filenode), 1))
                  grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
                  for annotation, group in grouped_annotations_lines:
                      yield (
                          annotation, [(line_no, tokens)
                                        for (_, line_no, tokens) in group]
                      )
              def render_tokenstream(tokenstream):
                  result = []
                  for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
                      if token_class:
                          result.append(u'<span class="%s">' % token_class)
                      else:
                          result.append(u'<span>')
                      for op_tag, token_text in token_ops_texts:
                          if op_tag:
                              result.append(u'<%s>' % op_tag)
                          escaped_text = html_escape(token_text)
                          # TODO: dan: investigate showing hidden characters like space/nl/tab
                          # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
                          # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
                          # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
                          result.append(escaped_text)
                          if op_tag:
                              result.append(u'</%s>' % op_tag)
                      result.append(u'</span>')
                  html = ''.join(result)
                  return html
              def rollup_tokenstream(tokenstream):
                  """
                  Group a token stream of the format:
                      ('class', 'op', 'text')
                  or
                      ('class', 'text')
                  into
                      [('class1',
                          [('op1', 'text'),
                           ('op2', 'text')]),
                       ('class2',
                          [('op3', 'text')])]
                  This is used to get the minimal tags necessary when
                  rendering to html eg for a token stream ie.
                  <span class="A"><ins>he</ins>llo</span>
                  vs
                  <span class="A"><ins>he</ins></span><span class="A">llo</span>
                  If a 2 tuple is passed in, the output op will be an empty string.
                  eg:
                  >>> rollup_tokenstream([('classA', '',      'h'),
                                          ('classA', 'del',   'ell'),
                                          ('classA', '',      'o'),
                                          ('classB', '',      ' '),
                                          ('classA', '',      'the'),
                                          ('classA', '',      're'),
                                          ])
                  [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
                   ('classB', [('', ' ')],
                   ('classA', [('', 'there')]]
                  """
                  if tokenstream and len(tokenstream[0]) == 2:
                      tokenstream = ((t[0], '', t[1]) for t in tokenstream)
                  result = []
                  for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
                      ops = []
                      for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
                          text_buffer = []
                          for t_class, t_op, t_text in token_text_list:
                              text_buffer.append(t_text)
                          ops.append((token_op, ''.join(text_buffer)))
                      result.append((token_class, ops))
                  return result
              def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
                  """
                  Converts a list of (token_class, token_text) tuples to a list of
                  (token_class, token_op, token_text) tuples where token_op is one of
                  ('ins', 'del', '')
                  :param old_tokens: list of (token_class, token_text) tuples of old line
                  :param new_tokens: list of (token_class, token_text) tuples of new line
                  :param use_diff_match_patch: boolean, will use google's diff match patch
                      library which has options to 'smooth' out the character by character
                      differences making nicer ins/del blocks
                  """
                  old_tokens_result = []
                  new_tokens_result = []
                  similarity = difflib.SequenceMatcher(None,
                      ''.join(token_text for token_class, token_text in old_tokens),
                      ''.join(token_text for token_class, token_text in new_tokens)
                  ).ratio()
                  if similarity < 0.6: # return, the blocks are too different
                      for token_class, token_text in old_tokens:
                          old_tokens_result.append((token_class, '', token_text))
                      for token_class, token_text in new_tokens:
                          new_tokens_result.append((token_class, '', token_text))
                      return old_tokens_result, new_tokens_result, similarity
                  token_sequence_matcher = difflib.SequenceMatcher(None,
                      [x[1] for x in old_tokens],
                      [x[1] for x in new_tokens])
                  for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
                      # check the differences by token block types first to give a more
                      # nicer "block" level replacement vs character diffs
                      if tag == 'equal':
                          for token_class, token_text in old_tokens[o1:o2]:
                              old_tokens_result.append((token_class, '', token_text))
                          for token_class, token_text in new_tokens[n1:n2]:
                              new_tokens_result.append((token_class, '', token_text))
                      elif tag == 'delete':
                          for token_class, token_text in old_tokens[o1:o2]:
                              old_tokens_result.append((token_class, 'del', token_text))
                      elif tag == 'insert':
                          for token_class, token_text in new_tokens[n1:n2]:
                              new_tokens_result.append((token_class, 'ins', token_text))
                      elif tag == 'replace':
                          # if same type token blocks must be replaced, do a diff on the
                          # characters in the token blocks to show individual changes
                          old_char_tokens = []
                          new_char_tokens = []
                          for token_class, token_text in old_tokens[o1:o2]:
                              for char in token_text:
                                  old_char_tokens.append((token_class, char))
                          for token_class, token_text in new_tokens[n1:n2]:
                              for char in token_text:
                                  new_char_tokens.append((token_class, char))
                          old_string = ''.join([token_text for
                              token_class, token_text in old_char_tokens])
                          new_string = ''.join([token_text for
                              token_class, token_text in new_char_tokens])
                          char_sequence = difflib.SequenceMatcher(
                              None, old_string, new_string)
                          copcodes = char_sequence.get_opcodes()
                          obuffer, nbuffer = [], []
                          if use_diff_match_patch:
                              dmp = diff_match_patch()
                              dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
                              reps = dmp.diff_main(old_string, new_string)
                              dmp.diff_cleanupEfficiency(reps)
                              a, b = 0, 0
                              for op, rep in reps:
                                  l = len(rep)
                                  if op == 0:
                                      for i, c in enumerate(rep):
                                          obuffer.append((old_char_tokens[a+i][0], '', c))
                                          nbuffer.append((new_char_tokens[b+i][0], '', c))
                                      a += l
                                      b += l
                                  elif op == -1:
                                      for i, c in enumerate(rep):
                                          obuffer.append((old_char_tokens[a+i][0], 'del', c))
                                      a += l
                                  elif op == 1:
                                      for i, c in enumerate(rep):
                                          nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
                                      b += l
                          else:
                              for ctag, co1, co2, cn1, cn2 in copcodes:
                                  if ctag == 'equal':
                                      for token_class, token_text in old_char_tokens[co1:co2]:
                                          obuffer.append((token_class, '', token_text))
                                      for token_class, token_text in new_char_tokens[cn1:cn2]:
                                          nbuffer.append((token_class, '', token_text))
                                  elif ctag == 'delete':
                                      for token_class, token_text in old_char_tokens[co1:co2]:
                                          obuffer.append((token_class, 'del', token_text))
                                  elif ctag == 'insert':
                                      for token_class, token_text in new_char_tokens[cn1:cn2]:
                                          nbuffer.append((token_class, 'ins', token_text))
                                  elif ctag == 'replace':
                                      for token_class, token_text in old_char_tokens[co1:co2]:
                                          obuffer.append((token_class, 'del', token_text))
                                      for token_class, token_text in new_char_tokens[cn1:cn2]:
                                          nbuffer.append((token_class, 'ins', token_text))
                          old_tokens_result.extend(obuffer)
                          new_tokens_result.extend(nbuffer)
                  return old_tokens_result, new_tokens_result, similarity
              class DiffSet(object):
                  """
                  An object for parsing the diff result from diffs.DiffProcessor and
                  adding highlighting, side by side/unified renderings and line diffs
                  """
                  HL_REAL = 'REAL' # highlights using original file, slow
                  HL_FAST = 'FAST' # highlights using just the line, fast but not correct
                                   # in the case of multiline code
                  HL_NONE = 'NONE' # no highlighting, fastest
                  def __init__(self, highlight_mode=HL_REAL, repo_name=None,
                               source_repo_name=None,
                               source_node_getter=lambda filename: None,
                               target_node_getter=lambda filename: None,
                               source_nodes=None, target_nodes=None,
                               max_file_size_limit=150 * 1024, # files over this size will
                                                               # use fast highlighting
                               comments=None,
                               ):
                      self.highlight_mode = highlight_mode
                      self.highlighted_filenodes = {}
                      self.source_node_getter = source_node_getter
                      self.target_node_getter = target_node_getter
                      self.source_nodes = source_nodes or {}
                      self.target_nodes = target_nodes or {}
                      self.repo_name = repo_name
                      self.source_repo_name = source_repo_name or repo_name
                      self.comments = comments or {}
                      self.comments_store = self.comments.copy()
                      self.max_file_size_limit = max_file_size_limit
                  def render_patchset(self, patchset, source_ref=None, target_ref=None):
                      diffset = AttributeDict(dict(
                          lines_added=0,
                          lines_deleted=0,
                          changed_files=0,
                          files=[],
                          limited_diff=isinstance(patchset, LimitedDiffContainer),
                          repo_name=self.repo_name,
                          source_repo_name=self.source_repo_name,
                          source_ref=source_ref,
                          target_ref=target_ref,
                      ))
                      for patch in patchset:
                          filediff = self.render_patch(patch)
                          filediff.diffset = diffset
                          diffset.files.append(filediff)
                          diffset.changed_files += 1
                          if not patch['stats']['binary']:
                              diffset.lines_added += patch['stats']['added']
                              diffset.lines_deleted += patch['stats']['deleted']
                      return diffset
                  _lexer_cache = {}
-                 def _get_lexer_for_filename(self, filename):
+                 def _get_lexer_for_filename(self, filename, filenode=None):
                      # cached because we might need to call it twice for source/target
                      if filename not in self._lexer_cache:
-                         self._lexer_cache[filename] = get_lexer_safe(filepath=filename)
+                         if filenode:
+                             lexer = filenode.lexer
+                         else:
+                             lexer = get_lexer_safe(filepath=filename)
+                         self._lexer_cache[filename] = lexer
                      return self._lexer_cache[filename]
                  def render_patch(self, patch):
                      log.debug('rendering diff for %r' % patch['filename'])
                      source_filename = patch['original_filename']
                      target_filename = patch['filename']
                      source_lexer = plain_text_lexer
                      target_lexer = plain_text_lexer
                      if not patch['stats']['binary']:
                          if self.highlight_mode == self.HL_REAL:
                              if (source_filename and patch['operation'] in ('D', 'M')
                                  and source_filename not in self.source_nodes):
                                      self.source_nodes[source_filename] = (
                                          self.source_node_getter(source_filename))
                              if (target_filename and patch['operation'] in ('A', 'M')
                                  and target_filename not in self.target_nodes):
                                      self.target_nodes[target_filename] = (
                                          self.target_node_getter(target_filename))
                          elif self.highlight_mode == self.HL_FAST:
                              source_lexer = self._get_lexer_for_filename(source_filename)
                              target_lexer = self._get_lexer_for_filename(target_filename)
                      source_file = self.source_nodes.get(source_filename, source_filename)
                      target_file = self.target_nodes.get(target_filename, target_filename)
                      source_filenode, target_filenode = None, None
                      # TODO: dan: FileNode.lexer works on the content of the file - which
                      # can be slow - issue #4289 explains a lexer clean up - which once
                      # done can allow caching a lexer for a filenode to avoid the file lookup
                      if isinstance(source_file, FileNode):
                          source_filenode = source_file
-                         source_lexer = source_file.lexer
+                         #source_lexer = source_file.lexer
+                         source_lexer = self._get_lexer_for_filename(source_filename)
+                         source_file.lexer = source_lexer
                      if isinstance(target_file, FileNode):
                          target_filenode = target_file
-                         target_lexer = target_file.lexer
+                         #target_lexer = target_file.lexer
+                         target_lexer = self._get_lexer_for_filename(target_filename)
+                         target_file.lexer = target_lexer
                      source_file_path, target_file_path = None, None
                      if source_filename != '/dev/null':
                          source_file_path = source_filename
                      if target_filename != '/dev/null':
                          target_file_path = target_filename
                      source_file_type = source_lexer.name
                      target_file_type = target_lexer.name
                      op_hunks = patch['chunks'][0]
                      hunks = patch['chunks'][1:]
                      filediff = AttributeDict({
                          'source_file_path': source_file_path,
                          'target_file_path': target_file_path,
                          'source_filenode': source_filenode,
                          'target_filenode': target_filenode,
                          'hunks': [],
                          'source_file_type': target_file_type,
                          'target_file_type': source_file_type,
                          'patch': patch,
                          'source_mode': patch['stats']['old_mode'],
                          'target_mode': patch['stats']['new_mode'],
                          'limited_diff': isinstance(patch, LimitedDiffContainer),
                          'diffset': self,
                      })
                      for hunk in hunks:
                          hunkbit = self.parse_hunk(hunk, source_file, target_file)
                          hunkbit.filediff = filediff
                          filediff.hunks.append(hunkbit)
                      left_comments = {}
                      if source_file_path in self.comments_store:
                          for lineno, comments in self.comments_store[source_file_path].items():
                              left_comments[lineno] = comments
                      if target_file_path in self.comments_store:
                          for lineno, comments in self.comments_store[target_file_path].items():
                              left_comments[lineno] = comments
                      filediff.left_comments = left_comments
                      return filediff
                  def parse_hunk(self, hunk, source_file, target_file):
                      result = AttributeDict(dict(
                          source_start=hunk['source_start'],
                          source_length=hunk['source_length'],
                          target_start=hunk['target_start'],
                          target_length=hunk['target_length'],
                          section_header=hunk['section_header'],
                          lines=[],
                      ))
                      before, after = [], []
                      for line in hunk['lines']:
                          if line['action'] == 'unmod':
                              result.lines.extend(
                                  self.parse_lines(before, after, source_file, target_file))
                              after.append(line)
                              before.append(line)
                          elif line['action'] == 'add':
                              after.append(line)
                          elif line['action'] == 'del':
                              before.append(line)
                          elif line['action'] == 'old-no-nl':
                              before.append(line)
                          elif line['action'] == 'new-no-nl':
                              after.append(line)
                      result.lines.extend(
                          self.parse_lines(before, after, source_file, target_file))
                      result.unified = self.as_unified(result.lines)
                      result.sideside = result.lines
                      return result
                  def parse_lines(self, before_lines, after_lines, source_file, target_file):
                      # TODO: dan: investigate doing the diff comparison and fast highlighting
                      # on the entire before and after buffered block lines rather than by
                      # line, this means we can get better 'fast' highlighting if the context
                      # allows it - eg.
                      # line 4: """
                      # line 5: this gets highlighted as a string
                      # line 6: """
                      lines = []
                      while before_lines or after_lines:
                          before, after = None, None
                          before_tokens, after_tokens = None, None
                          if before_lines:
                              before = before_lines.pop(0)
                          if after_lines:
                              after = after_lines.pop(0)
                          original = AttributeDict()
                          modified = AttributeDict()
                          if before:
                              if before['action'] == 'old-no-nl':
                                  before_tokens = [('nonl', before['line'])]
                              else:
                                  before_tokens = self.get_line_tokens(
                                      line_text=before['line'], line_number=before['old_lineno'],
                                      file=source_file)
                              original.lineno = before['old_lineno']
                              original.content = before['line']
                              original.action = self.action_to_op(before['action'])
                              original.comments = self.get_comments_for('old',
                                  source_file, before['old_lineno'])
                          if after:
                              if after['action'] == 'new-no-nl':
                                  after_tokens = [('nonl', after['line'])]
                              else:
                                  after_tokens = self.get_line_tokens(
                                      line_text=after['line'], line_number=after['new_lineno'],
                                      file=target_file)
                              modified.lineno = after['new_lineno']
                              modified.content = after['line']
                              modified.action = self.action_to_op(after['action'])
                              modified.comments = self.get_comments_for('new',
                                  target_file, after['new_lineno'])
                          # diff the lines
                          if before_tokens and after_tokens:
                              o_tokens, m_tokens, similarity = tokens_diff(
                                  before_tokens, after_tokens)
                              original.content = render_tokenstream(o_tokens)
                              modified.content = render_tokenstream(m_tokens)
                          elif before_tokens:
                              original.content = render_tokenstream(
                                  [(x[0], '', x[1]) for x in before_tokens])
                          elif after_tokens:
                              modified.content = render_tokenstream(
                                  [(x[0], '', x[1]) for x in after_tokens])
                          lines.append(AttributeDict({
                              'original': original,
                              'modified': modified,
                          }))
                      return lines
                  def get_comments_for(self, version, file, line_number):
                      if hasattr(file, 'unicode_path'):
                          file = file.unicode_path
                      if not isinstance(file, basestring):
                          return None
                      line_key = {
                          'old': 'o',
                          'new': 'n',
                      }[version] + str(line_number)
                      if file in self.comments_store:
                          file_comments = self.comments_store[file]
                          if line_key in file_comments:
                              return file_comments.pop(line_key)
                  def get_line_tokens(self, line_text, line_number, file=None):
                      filenode = None
                      filename = None
                      if isinstance(file, basestring):
                          filename = file
                      elif isinstance(file, FileNode):
                          filenode = file
                          filename = file.unicode_path
                      if self.highlight_mode == self.HL_REAL and filenode:
-                         if line_number and file.size < self.max_file_size_limit:
-                             return self.get_tokenized_filenode_line(file, line_number)
+                         lexer = self._get_lexer_for_filename(filename)
+                         file_size_allowed = file.size < self.max_file_size_limit
+                         if line_number and file_size_allowed:
+                             return self.get_tokenized_filenode_line(
+                                 file, line_number, lexer)
                      if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
                          lexer = self._get_lexer_for_filename(filename)
                          return list(tokenize_string(line_text, lexer))
                      return list(tokenize_string(line_text, plain_text_lexer))
-                 def get_tokenized_filenode_line(self, filenode, line_number):
+                 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
                      if filenode not in self.highlighted_filenodes:
-                         tokenized_lines = filenode_as_lines_tokens(filenode, filenode.lexer)
+                         tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
                          self.highlighted_filenodes[filenode] = tokenized_lines
                      return self.highlighted_filenodes[filenode][line_number - 1]
                  def action_to_op(self, action):
                      return {
                          'add': '+',
                          'del': '-',
                          'unmod': ' ',
                          'old-no-nl': ' ',
                          'new-no-nl': ' ',
                      }.get(action, action)
                  def as_unified(self, lines):
                      """
                      Return a generator that yields the lines of a diff in unified order
                      """
                      def generator():
                          buf = []
                          for line in lines:
                              if buf and not line.original or line.original.action == ' ':
                                  for b in buf:
                                      yield b
                                  buf = []
                              if line.original:
                                  if line.original.action == ' ':
                                      yield (line.original.lineno, line.modified.lineno,
                                             line.original.action, line.original.content,
                                             line.original.comments)
                                      continue
                                  if line.original.action == '-':
                                      yield (line.original.lineno, None,
                                             line.original.action, line.original.content,
                                             line.original.comments)
                                  if line.modified.action == '+':
                                      buf.append((
                                          None, line.modified.lineno,
                                          line.modified.action, line.modified.content,
                                          line.modified.comments))
                                      continue
                              if line.modified:
                                  yield (None, line.modified.lineno,
                                         line.modified.action, line.modified.content,
                                         line.modified.comments)
                          for b in buf:
                              yield b
                      return generator()

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages