rhodecode-enterprise-ce Files · rhodecode/lib/codeblocks.py

todos: all todos need to be resolved for merge to happen....

todos: all todos need to be resolved for merge to happen. This will prevent the outdated todos being automatically marked as solved because of bigger diff changes. It's better to mark commits quickly as resolved instead of potentially having unresolved todos hidden because of invalidation logic.

marcink - - Load All Authors

File last commit:

r1271:47a44c03 default


                r1342:44fc3039

default

Download file

             codeblocks.py
        
                    687 lines
            
             | 25.4 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / rhodecode / lib / codeblocks.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # -*- coding: utf-8 -*-

      # Copyright (C) 2011-2017 RhodeCode GmbH

      #

      # This program is free software: you can redistribute it and/or modify

      # it under the terms of the GNU Affero General Public License, version 3

      # (only), as published by the Free Software Foundation.

      #

      # This program is distributed in the hope that it will be useful,

      # but WITHOUT ANY WARRANTY; without even the implied warranty of

      # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

      # GNU General Public License for more details.

      #

      # You should have received a copy of the GNU Affero General Public License

      # along with this program.  If not, see <http://www.gnu.org/licenses/>.

      #

      # This program is dual-licensed. If you wish to learn more about the

      # RhodeCode Enterprise Edition, including its added features, Support services,

      # and proprietary license terms, please see https://rhodecode.com/licenses/

      import logging

      import difflib

      from itertools import groupby

      from pygments import lex

      from pygments.formatters.html import _get_ttype_class as pygment_token_class

      from rhodecode.lib.helpers import (

          get_lexer_for_filenode, get_lexer_safe, html_escape)

      from rhodecode.lib.utils2 import AttributeDict

      from rhodecode.lib.vcs.nodes import FileNode

      from rhodecode.lib.diff_match_patch import diff_match_patch

      from rhodecode.lib.diffs import LimitedDiffContainer

      from pygments.lexers import get_lexer_by_name

      # Shared fallback lexer: used by DiffSet as the default source/target lexer
      # and whenever a line has to be tokenized without syntax highlighting.
      plain_text_lexer = get_lexer_by_name(

          'text', stripall=False, stripnl=False, ensurenl=False)

      # Module-scoped logger (not the root logger) so the records emitted here
      # can be filtered and configured per module.
      log = logging.getLogger(__name__)

      def filenode_as_lines_tokens(filenode, lexer=None):
          """
          Tokenize the content of a file node and return it split into lines.

          :param filenode: node whose ``content`` attribute is tokenized
          :param lexer: lexer to use; when falsy, one is looked up with
              ``get_lexer_for_filenode``
          :returns: list with one entry per line, each entry being the list of
              token tuples produced by ``tokenize_string`` for that line
          """
          if not lexer:
              lexer = get_lexer_for_filenode(filenode)
          log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
          token_stream = tokenize_string(filenode.content, lexer)
          return list(split_token_stream(token_stream, split_string='\n'))

      def tokenize_string(content, lexer):
          """
          Lazily tokenize ``content`` with pygments using the given lexer,
          making sure original new lines and whitespace are preserved.

          The whitespace-stripping options are switched off directly on the
          passed-in ``lexer`` object, so the caller's lexer is modified.

          :returns: generator of (pygment_token_class(token_type), token_text)
          """
          for option in ('stripall', 'stripnl', 'ensurenl'):
              setattr(lexer, option, False)

          for token_type, token_text in lex(content, lexer):
              yield pygment_token_class(token_type), token_text

      def split_token_stream(tokens, split_string=u'\n'):
          """
          Lazily regroup a stream of (token_class, text) tuples into lines.

          Every occurrence of `split_string` inside a token text ends the
          current line; a token spanning several lines is cut into one piece per
          line, each piece keeping the token class. The separator itself is
          dropped. One list of tokens is yielded per line.

          >>> list(split_token_stream([(TEXT, 'some\\ntext'), (TEXT, 'more\\n')]))
          [[(TEXT, 'some')],
           [(TEXT, 'text'), (TEXT, 'more')],
           [(TEXT, '')]]
          """
          current_line = []
          for token_class, token_text in tokens:
              pieces = token_text.split(split_string)
              # everything but the last piece is terminated by a separator
              unterminated = pieces.pop()
              for piece in pieces:
                  current_line.append((token_class, piece))
                  yield current_line
                  current_line = []
              current_line.append((token_class, unterminated))

          if current_line:
              yield current_line

      def filenode_as_annotated_lines_tokens(filenode):
          """
          Take a file node and yield (annotation, lines) pairs, grouping
          consecutive lines that share the same annotation. Lines without an
          entry in ``filenode.annotate`` get ``None`` as their annotation.

          eg:

          [
              (annotation1, [
                  (1, line1_tokens_list),
                  (2, line2_tokens_list),
              ]),
              (annotation2, [
                  (3, line1_tokens_list),
              ]),
              (None, [
                  (4, line1_tokens_list),
              ]),
              (annotation1, [
                  (5, line1_tokens_list),
                  (6, line2_tokens_list),
              ])
          ]
          """
          resolved_commits = {}  # commit_id => commit_getter() result, looked up once
          line_annotations = {}
          for line_no, commit_id, commit_getter, line_content in filenode.annotate:
              if commit_id not in resolved_commits:
                  resolved_commits[commit_id] = commit_getter()
              line_annotations[line_no] = resolved_commits[commit_id]

          numbered_lines = enumerate(filenode_as_lines_tokens(filenode), 1)
          annotated_lines = (
              (line_annotations.get(line_no), line_no, tokens)
              for line_no, tokens in numbered_lines)

          for annotation, members in groupby(annotated_lines, lambda item: item[0]):
              yield annotation, [
                  (line_no, tokens) for _, line_no, tokens in members]

      def render_tokenstream(tokenstream):
          """
          Render a token stream as an html string.

          The stream is first grouped with ``rollup_tokenstream``; every token
          class group becomes one ``<span>`` (with a ``class`` attribute when
          the class is truthy) and every non-empty op wraps its escaped text in
          a tag named after the op.
          """
          pieces = []
          emit = pieces.append
          for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
              if not token_class:
                  emit(u'<span>')
              else:
                  emit(u'<span class="%s">' % token_class)

              for op_tag, token_text in token_ops_texts:
                  # TODO: dan: investigate showing hidden characters like space/nl/tab
                  # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
                  # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
                  # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
                  escaped_text = html_escape(token_text)
                  if op_tag:
                      emit(u'<%s>' % op_tag)
                      emit(escaped_text)
                      emit(u'</%s>' % op_tag)
                  else:
                      emit(escaped_text)

              emit(u'</span>')

          return ''.join(pieces)

      def rollup_tokenstream(tokenstream):
          """
          Group a token stream of the format:

              ('class', 'op', 'text')

          or

              ('class', 'text')

          into

              [('class1',
                  [('op1', 'text'),
                   ('op2', 'text')]),
               ('class2',
                  [('op3', 'text')])]

          This is used to get the minimal tags necessary when
          rendering to html eg for a token stream ie.

          <span class="A"><ins>he</ins>llo</span>
          vs
          <span class="A"><ins>he</ins></span><span class="A">llo</span>

          If a 2 tuple is passed in, the output op will be an empty string.
          The shape of the stream is decided by its first item.

          :param tokenstream: any iterable (list, tuple, generator, ...) of
              token tuples
          :returns: list of (token_class, [(op, text), ...]) tuples

          eg:

          >>> rollup_tokenstream([('classA', '',      'h'),
                                  ('classA', 'del',   'ell'),
                                  ('classA', '',      'o'),
                                  ('classB', '',      ' '),
                                  ('classA', '',      'the'),
                                  ('classA', '',      're'),
                                  ])

          [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
           ('classB', [('', ' ')]),
           ('classA', [('', 'there')])]
          """
          # materialise first: the first item has to be inspected, which is not
          # possible on generators/iterators without consuming them
          tokenstream = list(tokenstream)
          if tokenstream and len(tokenstream[0]) == 2:
              tokenstream = [(t[0], '', t[1]) for t in tokenstream]

          result = []
          for token_class, class_group in groupby(tokenstream, lambda t: t[0]):
              # within one class, merge the texts of consecutive equal ops
              ops = [
                  (token_op, ''.join(t_text for _, _, t_text in op_group))
                  for token_op, op_group in groupby(class_group, lambda t: t[1])]
              result.append((token_class, ops))
          return result

      def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
          """
          Converts a list of (token_class, token_text) tuples to a list of
          (token_class, token_op, token_text) tuples where token_op is one of
          ('ins', 'del', '')

          :param old_tokens: list of (token_class, token_text) tuples of old line
          :param new_tokens: list of (token_class, token_text) tuples of new line
          :param use_diff_match_patch: boolean, will use google's diff match patch
              library which has options to 'smooth' out the character by character
              differences making nicer ins/del blocks
          :returns: tuple of (old_tokens_result, new_tokens_result, similarity)
              where similarity is the difflib ratio of the joined token texts
          """
          similarity = difflib.SequenceMatcher(
              None,
              ''.join(token_text for token_class, token_text in old_tokens),
              ''.join(token_text for token_class, token_text in new_tokens)
          ).ratio()

          if similarity < 0.6:  # return, the blocks are too different
              old_tokens_result = [
                  (token_class, '', token_text)
                  for token_class, token_text in old_tokens]
              new_tokens_result = [
                  (token_class, '', token_text)
                  for token_class, token_text in new_tokens]
              return old_tokens_result, new_tokens_result, similarity

          old_tokens_result = []
          new_tokens_result = []

          token_sequence_matcher = difflib.SequenceMatcher(
              None,
              [x[1] for x in old_tokens],
              [x[1] for x in new_tokens])

          for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
              # check the differences by token block types first to give a more
              # nicer "block" level replacement vs character diffs

              if tag == 'equal':
                  for token_class, token_text in old_tokens[o1:o2]:
                      old_tokens_result.append((token_class, '', token_text))
                  for token_class, token_text in new_tokens[n1:n2]:
                      new_tokens_result.append((token_class, '', token_text))
              elif tag == 'delete':
                  for token_class, token_text in old_tokens[o1:o2]:
                      old_tokens_result.append((token_class, 'del', token_text))
              elif tag == 'insert':
                  for token_class, token_text in new_tokens[n1:n2]:
                      new_tokens_result.append((token_class, 'ins', token_text))
              elif tag == 'replace':
                  # if same type token blocks must be replaced, do a diff on the
                  # characters in the token blocks to show individual changes

                  # one (token_class, char) entry per character, so every char of
                  # the diff below can be mapped back to its token class by index
                  old_char_tokens = []
                  new_char_tokens = []
                  for token_class, token_text in old_tokens[o1:o2]:
                      for char in token_text:
                          old_char_tokens.append((token_class, char))

                  for token_class, token_text in new_tokens[n1:n2]:
                      for char in token_text:
                          new_char_tokens.append((token_class, char))

                  old_string = ''.join(
                      char for token_class, char in old_char_tokens)
                  new_string = ''.join(
                      char for token_class, char in new_char_tokens)

                  obuffer, nbuffer = [], []

                  if use_diff_match_patch:
                      dmp = diff_match_patch()
                      dmp.Diff_EditCost = 11  # TODO: dan: extract this to a setting
                      reps = dmp.diff_main(old_string, new_string)
                      dmp.diff_cleanupEfficiency(reps)

                      # a/b: current offsets into old_char_tokens/new_char_tokens
                      a, b = 0, 0
                      for op, rep in reps:
                          rep_len = len(rep)
                          if op == 0:
                              for i, c in enumerate(rep):
                                  obuffer.append((old_char_tokens[a+i][0], '', c))
                                  nbuffer.append((new_char_tokens[b+i][0], '', c))
                              a += rep_len
                              b += rep_len
                          elif op == -1:
                              for i, c in enumerate(rep):
                                  obuffer.append((old_char_tokens[a+i][0], 'del', c))
                              a += rep_len
                          elif op == 1:
                              for i, c in enumerate(rep):
                                  nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
                              b += rep_len
                  else:
                      # the difflib character diff is only computed on this
                      # path, the diff_match_patch path does not need it
                      copcodes = difflib.SequenceMatcher(
                          None, old_string, new_string).get_opcodes()
                      for ctag, co1, co2, cn1, cn2 in copcodes:
                          if ctag == 'equal':
                              for token_class, token_text in old_char_tokens[co1:co2]:
                                  obuffer.append((token_class, '', token_text))
                              for token_class, token_text in new_char_tokens[cn1:cn2]:
                                  nbuffer.append((token_class, '', token_text))
                          elif ctag == 'delete':
                              for token_class, token_text in old_char_tokens[co1:co2]:
                                  obuffer.append((token_class, 'del', token_text))
                          elif ctag == 'insert':
                              for token_class, token_text in new_char_tokens[cn1:cn2]:
                                  nbuffer.append((token_class, 'ins', token_text))
                          elif ctag == 'replace':
                              for token_class, token_text in old_char_tokens[co1:co2]:
                                  obuffer.append((token_class, 'del', token_text))
                              for token_class, token_text in new_char_tokens[cn1:cn2]:
                                  nbuffer.append((token_class, 'ins', token_text))

                  old_tokens_result.extend(obuffer)
                  new_tokens_result.extend(nbuffer)

          return old_tokens_result, new_tokens_result, similarity

      class DiffSet(object):
          """
          An object for parsing the diff result from diffs.DiffProcessor and
          adding highlighting, side by side/unified renderings and line diffs
          """

          HL_REAL = 'REAL'  # highlights using original file, slow
          HL_FAST = 'FAST'  # highlights using just the line, fast but not correct
                            # in the case of multiline code
          HL_NONE = 'NONE'  # no highlighting, fastest

          def __init__(self, highlight_mode=HL_REAL, repo_name=None,
                       source_repo_name=None,
                       source_node_getter=lambda filename: None,
                       target_node_getter=lambda filename: None,
                       source_nodes=None, target_nodes=None,
                       max_file_size_limit=150 * 1024,  # files over this size will
                                                        # use fast highlighting
                       comments=None,
                       ):
              """
              :param highlight_mode: one of HL_REAL, HL_FAST or HL_NONE
              :param repo_name: stored on every rendered diffset
              :param source_repo_name: defaults to `repo_name` when falsy
              :param source_node_getter: callable taking a filename; used in
                  HL_REAL mode to fetch source nodes missing from `source_nodes`
              :param target_node_getter: same as above for the target side
              :param source_nodes: optional dict of filename => node
              :param target_nodes: optional dict of filename => node
              :param max_file_size_limit: file nodes of this size or bigger are
                  highlighted line by line instead of from the whole file
              :param comments: optional dict of file path => dict of comments
                  keyed by 'o<lineno>' / 'n<lineno>'
              """
              self.highlight_mode = highlight_mode
              self.highlighted_filenodes = {}
              self.source_node_getter = source_node_getter
              self.target_node_getter = target_node_getter
              self.source_nodes = source_nodes or {}
              self.target_nodes = target_nodes or {}
              self.repo_name = repo_name
              self.source_repo_name = source_repo_name or repo_name
              self.comments = comments or {}
              # shallow copy; comments are popped from the per-file dicts of
              # this store as they get attached to rendered lines
              self.comments_store = self.comments.copy()
              self.max_file_size_limit = max_file_size_limit

          def render_patchset(self, patchset, source_ref=None, target_ref=None):
              """
              Render every patch of `patchset` and collect the results, together
              with added/deleted line totals of the non binary patches, in an
              AttributeDict.
              """
              diffset = AttributeDict(dict(
                  lines_added=0,
                  lines_deleted=0,
                  changed_files=0,
                  files=[],
                  limited_diff=isinstance(patchset, LimitedDiffContainer),
                  repo_name=self.repo_name,
                  source_repo_name=self.source_repo_name,
                  source_ref=source_ref,
                  target_ref=target_ref,
              ))
              for patch in patchset:
                  filediff = self.render_patch(patch)
                  # replace the back reference set by render_patch with the
                  # rendered diffset this file belongs to
                  filediff.diffset = diffset
                  diffset.files.append(filediff)
                  diffset.changed_files += 1
                  if not patch['stats']['binary']:
                      diffset.lines_added += patch['stats']['added']
                      diffset.lines_deleted += patch['stats']['deleted']

              return diffset

          # class level, so the cache is shared between all DiffSet instances
          _lexer_cache = {}

          def _get_lexer_for_filename(self, filename):
              """Return the lexer for `filename`, looking it up only once."""
              # cached because we might need to call it twice for source/target
              if filename not in self._lexer_cache:
                  self._lexer_cache[filename] = get_lexer_safe(filepath=filename)
              return self._lexer_cache[filename]

          def render_patch(self, patch):
              """
              Render a single patch into an AttributeDict describing the file
              diff: paths, file nodes, lexer names, modes, parsed hunks and the
              comments left in the store for the file once the hunks are parsed.

              :param patch: dict; the keys read here are 'filename',
                  'original_filename', 'operation', 'stats' and 'chunks'
              """
              log.debug('rendering diff for %r', patch['filename'])

              source_filename = patch['original_filename']
              target_filename = patch['filename']

              source_lexer = plain_text_lexer
              target_lexer = plain_text_lexer

              if not patch['stats']['binary']:
                  if self.highlight_mode == self.HL_REAL:
                      # fetch the nodes still missing so lines can be
                      # highlighted from the whole file
                      if (source_filename and patch['operation'] in ('D', 'M')
                              and source_filename not in self.source_nodes):
                          self.source_nodes[source_filename] = (
                              self.source_node_getter(source_filename))

                      if (target_filename and patch['operation'] in ('A', 'M')
                              and target_filename not in self.target_nodes):
                          self.target_nodes[target_filename] = (
                              self.target_node_getter(target_filename))

                  elif self.highlight_mode == self.HL_FAST:
                      source_lexer = self._get_lexer_for_filename(source_filename)
                      target_lexer = self._get_lexer_for_filename(target_filename)

              # fall back to the plain filename when no node is known
              source_file = self.source_nodes.get(source_filename, source_filename)
              target_file = self.target_nodes.get(target_filename, target_filename)

              source_filenode, target_filenode = None, None

              # TODO: dan: FileNode.lexer works on the content of the file - which
              # can be slow - issue #4289 explains a lexer clean up - which once
              # done can allow caching a lexer for a filenode to avoid the file lookup
              if isinstance(source_file, FileNode):
                  source_filenode = source_file
                  source_lexer = source_file.lexer
              if isinstance(target_file, FileNode):
                  target_filenode = target_file
                  target_lexer = target_file.lexer

              source_file_path, target_file_path = None, None

              if source_filename != '/dev/null':
                  source_file_path = source_filename
              if target_filename != '/dev/null':
                  target_file_path = target_filename

              source_file_type = source_lexer.name
              target_file_type = target_lexer.name

              # the first chunk is not rendered as a hunk
              hunks = patch['chunks'][1:]

              filediff = AttributeDict({
                  'source_file_path': source_file_path,
                  'target_file_path': target_file_path,
                  'source_filenode': source_filenode,
                  'target_filenode': target_filenode,
                  'hunks': [],
                  # each side reports the name of its own lexer
                  'source_file_type': source_file_type,
                  'target_file_type': target_file_type,
                  'patch': patch,
                  'source_mode': patch['stats']['old_mode'],
                  'target_mode': patch['stats']['new_mode'],
                  'limited_diff': isinstance(patch, LimitedDiffContainer),
                  'diffset': self,
              })

              for hunk in hunks:
                  hunkbit = self.parse_hunk(hunk, source_file, target_file)
                  hunkbit.filediff = filediff
                  filediff.hunks.append(hunkbit)

              # comments still in the store at this point were not attached to
              # any line of the parsed hunks
              left_comments = {}
              if source_file_path in self.comments_store:
                  left_comments.update(self.comments_store[source_file_path])
              if target_file_path in self.comments_store:
                  left_comments.update(self.comments_store[target_file_path])

              filediff.left_comments = left_comments
              return filediff

          def parse_hunk(self, hunk, source_file, target_file):
              """
              Convert a hunk dict into an AttributeDict holding the hunk header
              data, the paired before/after `lines` (also exposed as `sideside`)
              and a `unified` generator over the same lines.
              """
              result = AttributeDict(dict(
                  source_start=hunk['source_start'],
                  source_length=hunk['source_length'],
                  target_start=hunk['target_start'],
                  target_length=hunk['target_length'],
                  section_header=hunk['section_header'],
                  lines=[],
              ))
              before, after = [], []

              for line in hunk['lines']:
                  action = line['action']
                  if action == 'unmod':
                      # an unmodified line closes the pending changed block;
                      # parse_lines empties both buffers in place
                      result.lines.extend(
                          self.parse_lines(before, after, source_file, target_file))
                      after.append(line)
                      before.append(line)
                  elif action in ('add', 'new-no-nl'):
                      after.append(line)
                  elif action in ('del', 'old-no-nl'):
                      before.append(line)

              result.lines.extend(
                  self.parse_lines(before, after, source_file, target_file))
              result.unified = self.as_unified(result.lines)
              result.sideside = result.lines

              return result

          def parse_lines(self, before_lines, after_lines, source_file, target_file):
              """
              Pair up the buffered before/after lines one by one and return a
              list of AttributeDicts with an `original` and a `modified` side,
              each carrying lineno, rendered content, action and comments.

              Both input lists are consumed (emptied) in place; parse_hunk
              relies on that to reuse its buffers.
              """
              # TODO: dan: investigate doing the diff comparison and fast highlighting
              # on the entire before and after buffered block lines rather than by
              # line, this means we can get better 'fast' highlighting if the context
              # allows it - eg.
              # line 4: """
              # line 5: this gets highlighted as a string
              # line 6: """

              lines = []
              while before_lines or after_lines:
                  before, after = None, None
                  before_tokens, after_tokens = None, None

                  if before_lines:
                      before = before_lines.pop(0)
                  if after_lines:
                      after = after_lines.pop(0)

                  original = AttributeDict()
                  modified = AttributeDict()

                  if before:
                      if before['action'] == 'old-no-nl':
                          before_tokens = [('nonl', before['line'])]
                      else:
                          before_tokens = self.get_line_tokens(
                              line_text=before['line'], line_number=before['old_lineno'],
                              file=source_file)
                      original.lineno = before['old_lineno']
                      original.content = before['line']
                      original.action = self.action_to_op(before['action'])
                      original.comments = self.get_comments_for('old',
                          source_file, before['old_lineno'])

                  if after:
                      if after['action'] == 'new-no-nl':
                          after_tokens = [('nonl', after['line'])]
                      else:
                          after_tokens = self.get_line_tokens(
                              line_text=after['line'], line_number=after['new_lineno'],
                              file=target_file)
                      modified.lineno = after['new_lineno']
                      modified.content = after['line']
                      modified.action = self.action_to_op(after['action'])
                      modified.comments = self.get_comments_for('new',
                          target_file, after['new_lineno'])

                  # diff the lines
                  if before_tokens and after_tokens:
                      o_tokens, m_tokens, similarity = tokens_diff(
                          before_tokens, after_tokens)
                      original.content = render_tokenstream(o_tokens)
                      modified.content = render_tokenstream(m_tokens)
                  elif before_tokens:
                      original.content = render_tokenstream(
                          [(x[0], '', x[1]) for x in before_tokens])
                  elif after_tokens:
                      modified.content = render_tokenstream(
                          [(x[0], '', x[1]) for x in after_tokens])

                  lines.append(AttributeDict({
                      'original': original,
                      'modified': modified,
                  }))

              return lines

          def get_comments_for(self, version, file, line_number):
              """
              Pop and return the comments stored for a line, or None when there
              are none or `file` cannot be resolved to a path string.

              :param version: 'old' or 'new'
              :param file: path string or an object with a `unicode_path`
              :param line_number: combined with the version into the
                  'o<lineno>' / 'n<lineno>' lookup key
              """
              if hasattr(file, 'unicode_path'):
                  file = file.unicode_path

              if not isinstance(file, basestring):
                  return None

              line_key = {
                  'old': 'o',
                  'new': 'n',
              }[version] + str(line_number)

              if file in self.comments_store:
                  file_comments = self.comments_store[file]
                  if line_key in file_comments:
                      # popped, so what stays in the store is what was never
                      # attached to a line
                      return file_comments.pop(line_key)

          def get_line_tokens(self, line_text, line_number, file=None):
              """
              Return the list of tokens for one line, using the best source the
              highlight mode allows: the line of the fully tokenized file node,
              the line text lexed by filename, or the plain text lexer.
              """
              filenode = None
              filename = None

              if isinstance(file, basestring):
                  filename = file
              elif isinstance(file, FileNode):
                  filenode = file
                  filename = file.unicode_path

              if self.highlight_mode == self.HL_REAL and filenode:
                  if line_number and file.size < self.max_file_size_limit:
                      return self.get_tokenized_filenode_line(file, line_number)

              if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
                  lexer = self._get_lexer_for_filename(filename)
                  return list(tokenize_string(line_text, lexer))

              return list(tokenize_string(line_text, plain_text_lexer))

          def get_tokenized_filenode_line(self, filenode, line_number):
              """
              Return the tokens of line `line_number` (1 based) of `filenode`;
              the node is tokenized once and cached on this instance.
              """
              if filenode not in self.highlighted_filenodes:
                  tokenized_lines = filenode_as_lines_tokens(filenode, filenode.lexer)
                  self.highlighted_filenodes[filenode] = tokenized_lines
              return self.highlighted_filenodes[filenode][line_number - 1]

          def action_to_op(self, action):
              """
              Map a line action to its one character diff op; unknown actions
              are returned unchanged.
              """
              return {
                  'add': '+',
                  'del': '-',
                  'unmod': ' ',
                  'old-no-nl': ' ',
                  'new-no-nl': ' ',
              }.get(action, action)

          def as_unified(self, lines):
              """
              Return a generator that yields the lines of a diff in unified order

              Every yielded item is a tuple of
              (old_lineno, new_lineno, action, content, comments).
              """
              def generator():
                  # additions paired with deletions wait here until the
                  # block of deletions they belong to has been yielded
                  buf = []
                  for line in lines:

                      if buf and (not line.original or
                                  line.original.action == ' '):
                          for b in buf:
                              yield b
                          buf = []

                      if line.original:
                          if line.original.action == ' ':
                              yield (line.original.lineno, line.modified.lineno,
                                     line.original.action, line.original.content,
                                     line.original.comments)
                              continue

                          if line.original.action == '-':
                              yield (line.original.lineno, None,
                                     line.original.action, line.original.content,
                                     line.original.comments)

                          if line.modified.action == '+':
                              buf.append((
                                  None, line.modified.lineno,
                                  line.modified.action, line.modified.content,
                                  line.modified.comments))
                              continue

                      if line.modified:
                          yield (None, line.modified.lineno,
                                 line.modified.action, line.modified.content,
                                 line.modified.comments)

                  for b in buf:
                      yield b

              return generator()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# -- coding: utf-8 --

				# Copyright (C) 2011-2017 RhodeCode GmbH
				#
				# This program is free software: you can redistribute it and/or modify
				# it under the terms of the GNU Affero General Public License, version 3
				# (only), as published by the Free Software Foundation.
				#
				# This program is distributed in the hope that it will be useful,
				# but WITHOUT ANY WARRANTY; without even the implied warranty of
				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				# GNU General Public License for more details.
				#
				# You should have received a copy of the GNU Affero General Public License
				# along with this program. If not, see <http://www.gnu.org/licenses/>.
				#
				# This program is dual-licensed. If you wish to learn more about the
				# RhodeCode Enterprise Edition, including its added features, Support services,
				# and proprietary license terms, please see https://rhodecode.com/licenses/

				import logging
				import difflib
				from itertools import groupby

				from pygments import lex
				from pygments.formatters.html import _get_ttype_class as pygment_token_class
				from rhodecode.lib.helpers import (
				get_lexer_for_filenode, get_lexer_safe, html_escape)
				from rhodecode.lib.utils2 import AttributeDict
				from rhodecode.lib.vcs.nodes import FileNode
				from rhodecode.lib.diff_match_patch import diff_match_patch
				from rhodecode.lib.diffs import LimitedDiffContainer
				from pygments.lexers import get_lexer_by_name

				# Lexer for pygments' 'text' alias, created with every whitespace and
				# newline normalisation option switched off. It is the default lexer
				# used for diffs and the fallback when no filename is available.
				plain_text_lexer = get_lexer_by_name(
				'text', stripall=False, stripnl=False, ensurenl=False)


				# NOTE(review): getLogger() without a name returns the root logger, so
				# records from this module are not attributed to a module-specific
				# logger - confirm this is intended.
				log = logging.getLogger()


				def filenode_as_lines_tokens(filenode, lexer=None):
				    """
				    Tokenize the content of a file node and return it line by line.

				    :param filenode: node whose ``content`` is tokenized
				    :param lexer: lexer to use; when falsy one is looked up through
				        ``get_lexer_for_filenode``
				    :return: list with one entry per line, each entry being a list of
				        (token_class, token_text) tuples
				    """
				    if not lexer:
				        lexer = get_lexer_for_filenode(filenode)
				    log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
				    token_stream = tokenize_string(filenode.content, lexer)
				    return list(split_token_stream(token_stream, split_string='\n'))


				def tokenize_string(content, lexer):
				    """
				    Use pygments to tokenize some content based on a lexer
				    ensuring all original new lines and whitespace is preserved

				    This is a generator: the lexer options are only changed (on the lexer
				    object that was passed in) once iteration starts.

				    :param content: text to tokenize
				    :param lexer: pygments lexer instance
				    :return: yields (token_class, token_text) tuples
				    """
				    # switch off every option that would make the lexer alter whitespace
				    for option_name in ('stripall', 'stripnl', 'ensurenl'):
				        setattr(lexer, option_name, False)

				    for ttype, text in lex(content, lexer):
				        yield pygment_token_class(ttype), text


				def split_token_stream(tokens, split_string=u'\n'):
				    """
				    Take an iterable of (TokenType, text) tuples and regroup it into lines,
				    splitting the token texts on ``split_string``.

				    One list of tokens is yielded per line. A token spanning several lines
				    is cut into one token per line, and the text after the last separator
				    is always emitted, even when it is empty.

				    >>> list(split_token_stream([(TEXT, 'some\\ntext'), (TEXT, 'more\\n')]))
				    [[(TEXT, 'some')],
				     [(TEXT, 'text'), (TEXT, 'more')],
				     [(TEXT, '')]]
				    """
				    current_line = []
				    for token_class, token_text in tokens:
				        pieces = token_text.split(split_string)
				        # the last piece has no separator behind it, so its line is
				        # still open and may be continued by the next token
				        unterminated = pieces.pop()
				        for piece in pieces:
				            current_line.append((token_class, piece))
				            yield current_line
				            current_line = []
				        current_line.append((token_class, unterminated))

				    if current_line:
				        yield current_line


				def filenode_as_annotated_lines_tokens(filenode):
				    """
				    Take a file node and yield (annotation, lines) pairs in which
				    consecutive lines sharing an annotation are grouped together. Lines
				    without an annotation are grouped under None.

				    eg:

				    [
				        (annotation1, [
				            (1, line1_tokens_list),
				            (2, line2_tokens_list),
				        ]),
				        (annotation2, [
				            (3, line1_tokens_list),
				        ]),
				        (None, [
				            (4, line1_tokens_list),
				        ]),
				        (annotation1, [
				            (5, line1_tokens_list),
				            (6, line2_tokens_list),
				        ])
				    ]
				    """
				    # each commit getter is called at most once per commit id
				    resolved_commits = {}
				    annotation_by_line = {}
				    for line_no, commit_id, commit_getter, line_content in filenode.annotate:
				        if commit_id not in resolved_commits:
				            resolved_commits[commit_id] = commit_getter()
				        annotation_by_line[line_no] = resolved_commits[commit_id]

				    annotated_lines = (
				        (annotation_by_line.get(line_no), line_no, tokens)
				        for line_no, tokens
				        in enumerate(filenode_as_lines_tokens(filenode), 1))

				    for annotation, members in groupby(annotated_lines, lambda item: item[0]):
				        numbered_lines = [
				            (line_no, tokens) for (_, line_no, tokens) in members]
				        yield (annotation, numbered_lines)


				def render_tokenstream(tokenstream):
				    """
				    Render a token stream to an html string.

				    The stream is first grouped with ``rollup_tokenstream``; every token
				    class becomes one ``<span>`` (with a ``class`` attribute when the class
				    is non-empty) and every non-empty op wraps its html-escaped text in a
				    tag named after the op.

				    :param tokenstream: token stream accepted by ``rollup_tokenstream``
				    :return: html string
				    """
				    fragments = []
				    for token_class, ops_and_texts in rollup_tokenstream(tokenstream):
				        fragments.append(
				            u'<span class="%s">' % token_class if token_class
				            else u'<span>')

				        for op_tag, token_text in ops_and_texts:
				            # TODO: dan: investigate showing hidden characters like space/nl/tab
				            # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
				            # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
				            # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
				            if op_tag:
				                fragments.extend((
				                    u'<%s>' % op_tag,
				                    html_escape(token_text),
				                    u'</%s>' % op_tag))
				            else:
				                fragments.append(html_escape(token_text))

				        fragments.append(u'</span>')

				    return ''.join(fragments)


				def rollup_tokenstream(tokenstream):
				    """
				    Group a token stream of the format:

				        ('class', 'op', 'text')
				        or
				        ('class', 'text')

				    into

				        [('class1',
				            [('op1', 'text'),
				             ('op2', 'text')]),
				         ('class2',
				            [('op3', 'text')])]

				    This is used to get the minimal tags necessary when
				    rendering to html eg for a token stream ie.

				        <span class="A"><ins>he</ins>llo</span>
				    vs
				        <span class="A"><ins>he</ins></span><span class="A">llo</span>

				    If a 2 tuple is passed in, the output op will be an empty string.

				    eg:

				    >>> rollup_tokenstream([('classA', '', 'h'),
				    ...                     ('classA', 'del', 'ell'),
				    ...                     ('classA', '', 'o'),
				    ...                     ('classB', '', ' '),
				    ...                     ('classA', '', 'the'),
				    ...                     ('classA', '', 're'),
				    ...                     ])
				    [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
				     ('classB', [('', ' ')]),
				     ('classA', [('', 'there')])]

				    :param tokenstream: any iterable of 2 or 3 tuples (lists, iterators
				        and generators are all accepted); a falsy value gives an empty
				        result
				    :return: list of (token_class, [(token_op, text), ...]) tuples
				    """
				    # Normalise every token to the 3 tuple form lazily. Doing it per token
				    # (rather than indexing the first element) keeps one-shot iterators
				    # and generators usable.
				    normalized = (
				        (token[0], '', token[1]) if len(token) == 2 else token
				        for token in tokenstream or ())

				    rolled_up = []
				    for token_class, class_group in groupby(normalized, lambda t: t[0]):
				        # within one class, merge the texts of adjacent tokens sharing an op
				        ops = [
				            (token_op, ''.join(token[2] for token in op_group))
				            for token_op, op_group
				            in groupby(class_group, lambda t: t[1])]
				        rolled_up.append((token_class, ops))
				    return rolled_up


				def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
				    """
				    Converts a list of (token_class, token_text) tuples to a list of
				    (token_class, token_op, token_text) tuples where token_op is one of
				    ('ins', 'del', '')

				    :param old_tokens: list of (token_class, token_text) tuples of old line
				    :param new_tokens: list of (token_class, token_text) tuples of new line
				    :param use_diff_match_patch: boolean, will use google's diff match patch
				        library which has options to 'smooth' out the character by character
				        differences making nicer ins/del blocks
				    :return: (old_tokens_result, new_tokens_result, similarity) where
				        similarity is the difflib ratio of the two joined line texts
				    """

				    old_tokens_result = []
				    new_tokens_result = []

				    similarity = difflib.SequenceMatcher(
				        None,
				        ''.join(token_text for token_class, token_text in old_tokens),
				        ''.join(token_text for token_class, token_text in new_tokens)
				    ).ratio()

				    if similarity < 0.6:  # return, the blocks are too different
				        old_tokens_result.extend(
				            (token_class, '', token_text)
				            for token_class, token_text in old_tokens)
				        new_tokens_result.extend(
				            (token_class, '', token_text)
				            for token_class, token_text in new_tokens)
				        return old_tokens_result, new_tokens_result, similarity

				    token_sequence_matcher = difflib.SequenceMatcher(
				        None,
				        [x[1] for x in old_tokens],
				        [x[1] for x in new_tokens])

				    for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
				        # check the differences by token block types first to give a more
				        # nicer "block" level replacement vs character diffs

				        if tag == 'equal':
				            for token_class, token_text in old_tokens[o1:o2]:
				                old_tokens_result.append((token_class, '', token_text))
				            for token_class, token_text in new_tokens[n1:n2]:
				                new_tokens_result.append((token_class, '', token_text))
				        elif tag == 'delete':
				            for token_class, token_text in old_tokens[o1:o2]:
				                old_tokens_result.append((token_class, 'del', token_text))
				        elif tag == 'insert':
				            for token_class, token_text in new_tokens[n1:n2]:
				                new_tokens_result.append((token_class, 'ins', token_text))
				        elif tag == 'replace':
				            # if same type token blocks must be replaced, do a diff on the
				            # characters in the token blocks to show individual changes

				            # explode the blocks into one (token_class, char) per character
				            old_char_tokens = [
				                (token_class, char)
				                for token_class, token_text in old_tokens[o1:o2]
				                for char in token_text]
				            new_char_tokens = [
				                (token_class, char)
				                for token_class, token_text in new_tokens[n1:n2]
				                for char in token_text]

				            old_string = ''.join(char for _, char in old_char_tokens)
				            new_string = ''.join(char for _, char in new_char_tokens)

				            obuffer, nbuffer = [], []

				            if use_diff_match_patch:
				                dmp = diff_match_patch()
				                dmp.Diff_EditCost = 11  # TODO: dan: extract this to a setting
				                reps = dmp.diff_main(old_string, new_string)
				                dmp.diff_cleanupEfficiency(reps)

				                # a / b: read positions in the old / new character lists
				                a, b = 0, 0
				                for op, rep in reps:
				                    l = len(rep)
				                    if op == 0:
				                        for i, c in enumerate(rep):
				                            obuffer.append((old_char_tokens[a+i][0], '', c))
				                            nbuffer.append((new_char_tokens[b+i][0], '', c))
				                        a += l
				                        b += l
				                    elif op == -1:
				                        for i, c in enumerate(rep):
				                            obuffer.append((old_char_tokens[a+i][0], 'del', c))
				                        a += l
				                    elif op == 1:
				                        for i, c in enumerate(rep):
				                            nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
				                        b += l
				            else:
				                # the difflib character opcodes are only needed on this
				                # fallback path, so they are computed here instead of for
				                # every replace block
				                copcodes = difflib.SequenceMatcher(
				                    None, old_string, new_string).get_opcodes()
				                for ctag, co1, co2, cn1, cn2 in copcodes:
				                    if ctag == 'equal':
				                        for token_class, token_text in old_char_tokens[co1:co2]:
				                            obuffer.append((token_class, '', token_text))
				                        for token_class, token_text in new_char_tokens[cn1:cn2]:
				                            nbuffer.append((token_class, '', token_text))
				                    elif ctag == 'delete':
				                        for token_class, token_text in old_char_tokens[co1:co2]:
				                            obuffer.append((token_class, 'del', token_text))
				                    elif ctag == 'insert':
				                        for token_class, token_text in new_char_tokens[cn1:cn2]:
				                            nbuffer.append((token_class, 'ins', token_text))
				                    elif ctag == 'replace':
				                        for token_class, token_text in old_char_tokens[co1:co2]:
				                            obuffer.append((token_class, 'del', token_text))
				                        for token_class, token_text in new_char_tokens[cn1:cn2]:
				                            nbuffer.append((token_class, 'ins', token_text))

				            old_tokens_result.extend(obuffer)
				            new_tokens_result.extend(nbuffer)

				    return old_tokens_result, new_tokens_result, similarity


				class DiffSet(object):
				    """
				    An object for parsing the diff result from diffs.DiffProcessor and
				    adding highlighting, side by side/unified renderings and line diffs
				    """

				    HL_REAL = 'REAL'  # highlights using original file, slow
				    HL_FAST = 'FAST'  # highlights using just the line, fast but not correct
				                      # in the case of multiline code
				    HL_NONE = 'NONE'  # no highlighting, fastest

				    def __init__(self, highlight_mode=HL_REAL, repo_name=None,
				                 source_repo_name=None,
				                 source_node_getter=lambda filename: None,
				                 target_node_getter=lambda filename: None,
				                 source_nodes=None, target_nodes=None,
				                 max_file_size_limit=150 * 1024,  # files over this size will
				                                                  # use fast highlighting
				                 comments=None,
				                 ):
				        """
				        :param highlight_mode: one of HL_REAL, HL_FAST, HL_NONE
				        :param repo_name: stored on the rendered diffset
				        :param source_repo_name: defaults to ``repo_name`` when falsy
				        :param source_node_getter: callable taking a filename, used to
				            look up source nodes that are not in ``source_nodes``
				        :param target_node_getter: same as above for target nodes
				        :param source_nodes: optional dict of filename => node
				        :param target_nodes: optional dict of filename => node
				        :param max_file_size_limit: files at or over this size are not
				            highlighted from the whole file
				        :param comments: optional dict of file path => {line key => comments}
				        """

				        self.highlight_mode = highlight_mode
				        self.highlighted_filenodes = {}
				        self.source_node_getter = source_node_getter
				        self.target_node_getter = target_node_getter
				        self.source_nodes = source_nodes or {}
				        self.target_nodes = target_nodes or {}
				        self.repo_name = repo_name
				        self.source_repo_name = source_repo_name or repo_name
				        self.comments = comments or {}
				        # NOTE: shallow copy - the per file dicts are still shared with
				        # `comments`, and get_comments_for() pops entries out of them
				        self.comments_store = self.comments.copy()
				        self.max_file_size_limit = max_file_size_limit

				    def render_patchset(self, patchset, source_ref=None, target_ref=None):
				        """
				        Render every patch of a patchset and collect the totals.

				        :param patchset: iterable of patches accepted by render_patch
				        :param source_ref: stored as is on the result
				        :param target_ref: stored as is on the result
				        :return: AttributeDict with the rendered ``files`` plus the
				            lines_added / lines_deleted / changed_files counters
				        """
				        diffset = AttributeDict(dict(
				            lines_added=0,
				            lines_deleted=0,
				            changed_files=0,
				            files=[],
				            limited_diff=isinstance(patchset, LimitedDiffContainer),
				            repo_name=self.repo_name,
				            source_repo_name=self.source_repo_name,
				            source_ref=source_ref,
				            target_ref=target_ref,
				        ))
				        for patch in patchset:
				            filediff = self.render_patch(patch)
				            filediff.diffset = diffset
				            diffset.files.append(filediff)
				            diffset.changed_files += 1
				            # binary patches count as changed files but add no line stats
				            if not patch['stats']['binary']:
				                diffset.lines_added += patch['stats']['added']
				                diffset.lines_deleted += patch['stats']['deleted']

				        return diffset

				    # Defined on the class, so the cache is shared by all instances.
				    _lexer_cache = {}

				    def _get_lexer_for_filename(self, filename):
				        """Return the lexer for ``filename``, looking it up only once."""
				        # cached because we might need to call it twice for source/target
				        if filename not in self._lexer_cache:
				            self._lexer_cache[filename] = get_lexer_safe(filepath=filename)
				        return self._lexer_cache[filename]

				    def render_patch(self, patch):
				        """
				        Build the file level diff structure for a single patch.

				        :param patch: mapping providing the 'filename', 'original_filename',
				            'operation', 'stats' and 'chunks' keys
				        :return: AttributeDict with the file paths, file nodes, file
				            types, modes, parsed ``hunks`` and ``left_comments``
				        """
				        log.debug('rendering diff for %r', patch['filename'])

				        source_filename = patch['original_filename']
				        target_filename = patch['filename']

				        source_lexer = plain_text_lexer
				        target_lexer = plain_text_lexer

				        if not patch['stats']['binary']:
				            if self.highlight_mode == self.HL_REAL:
				                # fetch the nodes on demand; a deleted file has no target
				                # and an added file has no source
				                if (source_filename and patch['operation'] in ('D', 'M')
				                        and source_filename not in self.source_nodes):
				                    self.source_nodes[source_filename] = (
				                        self.source_node_getter(source_filename))

				                if (target_filename and patch['operation'] in ('A', 'M')
				                        and target_filename not in self.target_nodes):
				                    self.target_nodes[target_filename] = (
				                        self.target_node_getter(target_filename))

				            elif self.highlight_mode == self.HL_FAST:
				                source_lexer = self._get_lexer_for_filename(source_filename)
				                target_lexer = self._get_lexer_for_filename(target_filename)

				        # fall back to the plain filename when no node is known
				        source_file = self.source_nodes.get(source_filename, source_filename)
				        target_file = self.target_nodes.get(target_filename, target_filename)

				        source_filenode, target_filenode = None, None

				        # TODO: dan: FileNode.lexer works on the content of the file - which
				        # can be slow - issue #4289 explains a lexer clean up - which once
				        # done can allow caching a lexer for a filenode to avoid the file lookup
				        if isinstance(source_file, FileNode):
				            source_filenode = source_file
				            source_lexer = source_file.lexer
				        if isinstance(target_file, FileNode):
				            target_filenode = target_file
				            target_lexer = target_file.lexer

				        source_file_path, target_file_path = None, None

				        if source_filename != '/dev/null':
				            source_file_path = source_filename
				        if target_filename != '/dev/null':
				            target_file_path = target_filename

				        source_file_type = source_lexer.name
				        target_file_type = target_lexer.name

				        # the first chunk is not rendered as a hunk
				        op_hunks = patch['chunks'][0]
				        hunks = patch['chunks'][1:]

				        filediff = AttributeDict({
				            'source_file_path': source_file_path,
				            'target_file_path': target_file_path,
				            'source_filenode': source_filenode,
				            'target_filenode': target_filenode,
				            'hunks': [],
				            # each side reports its own lexer name (these two values
				            # used to be stored swapped)
				            'source_file_type': source_file_type,
				            'target_file_type': target_file_type,
				            'patch': patch,
				            'source_mode': patch['stats']['old_mode'],
				            'target_mode': patch['stats']['new_mode'],
				            'limited_diff': isinstance(patch, LimitedDiffContainer),
				            'diffset': self,
				        })

				        for hunk in hunks:
				            hunkbit = self.parse_hunk(hunk, source_file, target_file)
				            hunkbit.filediff = filediff
				            filediff.hunks.append(hunkbit)

				        # whatever is still in the comments store for these paths after
				        # the hunks have been parsed was not attached to a rendered line
				        left_comments = {}
				        for file_path in (source_file_path, target_file_path):
				            if file_path in self.comments_store:
				                left_comments.update(self.comments_store[file_path])

				        filediff.left_comments = left_comments
				        return filediff

				    def parse_hunk(self, hunk, source_file, target_file):
				        """
				        Parse one hunk into side by side lines plus a unified view.

				        :param hunk: mapping providing 'source_start', 'source_length',
				            'target_start', 'target_length', 'section_header' and 'lines'
				        :param source_file: FileNode or filename of the old side
				        :param target_file: FileNode or filename of the new side
				        :return: AttributeDict with ``lines``, ``sideside`` (the same list)
				            and ``unified`` (a generator over those lines)
				        """
				        result = AttributeDict(dict(
				            source_start=hunk['source_start'],
				            source_length=hunk['source_length'],
				            target_start=hunk['target_start'],
				            target_length=hunk['target_length'],
				            section_header=hunk['section_header'],
				            lines=[],
				        ))
				        before, after = [], []

				        for line in hunk['lines']:
				            if line['action'] == 'unmod':
				                # an unmodified line ends the current run of changes:
				                # flush it (parse_lines empties both buffers), then queue
				                # the context line on both sides
				                result.lines.extend(
				                    self.parse_lines(before, after, source_file, target_file))
				                after.append(line)
				                before.append(line)
				            elif line['action'] == 'add':
				                after.append(line)
				            elif line['action'] == 'del':
				                before.append(line)
				            elif line['action'] == 'old-no-nl':
				                before.append(line)
				            elif line['action'] == 'new-no-nl':
				                after.append(line)

				        result.lines.extend(
				            self.parse_lines(before, after, source_file, target_file))
				        result.unified = self.as_unified(result.lines)
				        result.sideside = result.lines

				        return result

				    def parse_lines(self, before_lines, after_lines, source_file, target_file):
				        """
				        Pair up old and new lines and render their highlighted content.

				        Both input lists are consumed: they are empty when this returns.

				        :param before_lines: list of old side line dicts
				        :param after_lines: list of new side line dicts
				        :param source_file: FileNode or filename of the old side
				        :param target_file: FileNode or filename of the new side
				        :return: list of AttributeDicts with ``original`` and ``modified``
				            entries; the entry of a missing side is left empty
				        """
				        # TODO: dan: investigate doing the diff comparison and fast highlighting
				        # on the entire before and after buffered block lines rather than by
				        # line, this means we can get better 'fast' highlighting if the context
				        # allows it - eg.
				        # line 4: """
				        # line 5: this gets highlighted as a string
				        # line 6: """

				        lines = []
				        while before_lines or after_lines:
				            before, after = None, None
				            before_tokens, after_tokens = None, None

				            if before_lines:
				                before = before_lines.pop(0)
				            if after_lines:
				                after = after_lines.pop(0)

				            original = AttributeDict()
				            modified = AttributeDict()

				            if before:
				                if before['action'] == 'old-no-nl':
				                    before_tokens = [('nonl', before['line'])]
				                else:
				                    before_tokens = self.get_line_tokens(
				                        line_text=before['line'],
				                        line_number=before['old_lineno'],
				                        file=source_file)
				                original.lineno = before['old_lineno']
				                original.content = before['line']
				                original.action = self.action_to_op(before['action'])
				                original.comments = self.get_comments_for(
				                    'old', source_file, before['old_lineno'])

				            if after:
				                if after['action'] == 'new-no-nl':
				                    after_tokens = [('nonl', after['line'])]
				                else:
				                    after_tokens = self.get_line_tokens(
				                        line_text=after['line'],
				                        line_number=after['new_lineno'],
				                        file=target_file)
				                modified.lineno = after['new_lineno']
				                modified.content = after['line']
				                modified.action = self.action_to_op(after['action'])
				                modified.comments = self.get_comments_for(
				                    'new', target_file, after['new_lineno'])

				            # diff the lines
				            if before_tokens and after_tokens:
				                o_tokens, m_tokens, similarity = tokens_diff(
				                    before_tokens, after_tokens)
				                original.content = render_tokenstream(o_tokens)
				                modified.content = render_tokenstream(m_tokens)
				            elif before_tokens:
				                original.content = render_tokenstream(
				                    [(x[0], '', x[1]) for x in before_tokens])
				            elif after_tokens:
				                modified.content = render_tokenstream(
				                    [(x[0], '', x[1]) for x in after_tokens])

				            lines.append(AttributeDict({
				                'original': original,
				                'modified': modified,
				            }))

				        return lines

				    def get_comments_for(self, version, file, line_number):
				        """
				        Pop and return the comments stored for one line of a file.

				        :param version: 'old' or 'new'
				        :param file: file path, or an object with a ``unicode_path``
				        :param line_number: line number on the given side
				        :return: the stored comments, or None when there are none; a
				            returned entry is removed from the comments store
				        """
				        if hasattr(file, 'unicode_path'):
				            file = file.unicode_path

				        if not isinstance(file, basestring):
				            return None

				        # comments are keyed by 'o<lineno>' / 'n<lineno>'
				        line_key = {
				            'old': 'o',
				            'new': 'n',
				        }[version] + str(line_number)

				        if file in self.comments_store:
				            file_comments = self.comments_store[file]
				            if line_key in file_comments:
				                return file_comments.pop(line_key)
				        return None

				    def get_line_tokens(self, line_text, line_number, file=None):
				        """
				        Tokenize one diff line according to the highlight mode.

				        :param line_text: text of the line
				        :param line_number: number of the line in ``file``
				        :param file: FileNode, filename or None
				        :return: list of (token_class, token_text) tuples
				        """
				        filenode = None
				        filename = None

				        if isinstance(file, basestring):
				            filename = file
				        elif isinstance(file, FileNode):
				            filenode = file
				            filename = file.unicode_path

				        # real highlighting: take the line out of the tokenized whole file
				        if self.highlight_mode == self.HL_REAL and filenode:
				            if line_number and file.size < self.max_file_size_limit:
				                return self.get_tokenized_filenode_line(file, line_number)

				        # fast highlighting (also the fallback for real mode): lex only
				        # this line with a lexer picked by filename
				        if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
				            lexer = self._get_lexer_for_filename(filename)
				            return list(tokenize_string(line_text, lexer))

				        return list(tokenize_string(line_text, plain_text_lexer))

				    def get_tokenized_filenode_line(self, filenode, line_number):
				        """
				        Return the tokens of one line (1-based) of a file node; the whole
				        file is tokenized once and kept in ``highlighted_filenodes``.
				        """

				        if filenode not in self.highlighted_filenodes:
				            tokenized_lines = filenode_as_lines_tokens(filenode, filenode.lexer)
				            self.highlighted_filenodes[filenode] = tokenized_lines
				        return self.highlighted_filenodes[filenode][line_number - 1]

				    def action_to_op(self, action):
				        """
				        Map a diff line action to its one character marker; unknown
				        actions are returned unchanged.
				        """
				        return {
				            'add': '+',
				            'del': '-',
				            'unmod': ' ',
				            'old-no-nl': ' ',
				            'new-no-nl': ' ',
				        }.get(action, action)

				    def as_unified(self, lines):
				        """
				        Return a generator that yields the lines of a diff in unified order

				        Each item is a (old_lineno, new_lineno, action, content, comments)
				        tuple. Additions paired with deletions are held back so that a run
				        of deletions is emitted before the additions that replace it.
				        """
				        def generator():
				            buf = []
				            for line in lines:

				                # flush the held back additions once the run of deletions
				                # ends (context line, or a line without an old side)
				                if buf and (not line.original
				                            or line.original.action == ' '):
				                    for b in buf:
				                        yield b
				                    buf = []

				                if line.original:
				                    if line.original.action == ' ':
				                        yield (line.original.lineno, line.modified.lineno,
				                               line.original.action, line.original.content,
				                               line.original.comments)
				                        continue

				                    if line.original.action == '-':
				                        yield (line.original.lineno, None,
				                               line.original.action, line.original.content,
				                               line.original.comments)

				                    if line.modified.action == '+':
				                        buf.append((
				                            None, line.modified.lineno,
				                            line.modified.action, line.modified.content,
				                            line.modified.comments))
				                        continue

				                if line.modified:
				                    yield (None, line.modified.lineno,
				                           line.modified.action, line.modified.content,
				                           line.modified.comments)

				            for b in buf:
				                yield b

				        return generator()