# HG changeset patch
# User Daniel Dourvaris
# Date 2016-10-19 07:37:07
# Node ID 8ba7d01618d4bc7169a72cb087c8bbdce58dffb7
# Parent  56031659137ba81341b737d79e08408e9160e6aa
codeblocks: add new code token rendering function that supports diff and normal tokens.

diff --git a/rhodecode/lib/codeblocks.py b/rhodecode/lib/codeblocks.py
--- a/rhodecode/lib/codeblocks.py
+++ b/rhodecode/lib/codeblocks.py
@@ -18,16 +18,33 @@
 # RhodeCode Enterprise Edition, including its added features, Support services,
 # and proprietary license terms, please see https://rhodecode.com/licenses/
 
-
+import logging
 from itertools import groupby
 
 from pygments import lex
-# PYGMENTS_TOKEN_TYPES is used in a hot loop keep attribute lookups to a minimum
-from pygments.token import STANDARD_TYPES as PYGMENTS_TOKEN_TYPES
+from pygments.formatters.html import _get_ttype_class as pygment_token_class
+from rhodecode.lib.helpers import get_lexer_for_filenode, html_escape
+from rhodecode.lib.utils2 import AttributeDict
+from rhodecode.lib.vcs.nodes import FileNode
+from pygments.lexers import get_lexer_by_name
+
+plain_text_lexer = get_lexer_by_name(
+    'text', stripall=False, stripnl=False, ensurenl=False)
+
+
+log = logging.getLogger(__name__)
 
-from rhodecode.lib.helpers import get_lexer_for_filenode
 
-def tokenize_file(content, lexer):
+def filenode_as_lines_tokens(filenode, lexer=None):
+    lexer = lexer or get_lexer_for_filenode(filenode)
+    log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
+    # use the resolved lexer instead of a second get_lexer_for_filenode() lookup
+    tokens = tokenize_string(filenode.content, lexer)
+    lines = split_token_stream(tokens, split_string='\n')
+    return list(lines)
+
+
+def tokenize_string(content, lexer):
     """
     Use pygments to tokenize some content based on a lexer
     ensuring all original new lines and whitespace are preserved
     """
 
@@ -36,65 +53,33 @@ def tokenize_file(content, lexer):
     lexer.stripall = False
     lexer.stripnl = False
     lexer.ensurenl = False
-    return lex(content, lexer)
+    for token_type, token_text in lex(content, lexer):
+        yield pygment_token_class(token_type), token_text
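+
+# Example (illustrative sketch, not from the original change): tokenize_string
+# yields (css_class, text) pairs, e.g. with the module-level plain_text_lexer:
+#
+#   list(tokenize_string(u'a\nb', plain_text_lexer))
+#   => [('', u'a\nb')]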
 
 
-def pygment_token_class(token_type):
-    """ Convert a pygments token type to html class name """
-
-    fname = PYGMENTS_TOKEN_TYPES.get(token_type)
-    if fname:
-        return fname
-
-    aname = ''
-    while fname is None:
-        aname = '-' + token_type[-1] + aname
-        token_type = token_type.parent
-        fname = PYGMENTS_TOKEN_TYPES.get(token_type)
-
-    return fname + aname
-
-
-def tokens_as_lines(tokens, split_string=u'\n'):
+def split_token_stream(tokens, split_string=u'\n'):
     """
     Take a list of (TokenType, text) tuples and split them by a string
 
-    eg. [(TEXT, 'some\ntext')] => [(TEXT, 'some'), (TEXT, 'text')]
+    >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
+    [[(TEXT, 'some')],
+     [(TEXT, 'text'), (TEXT, 'more')],
+     [(TEXT, '')]]
     """
 
     buffer = []
-    for token_type, token_text in tokens:
+    for token_class, token_text in tokens:
         parts = token_text.split(split_string)
         for part in parts[:-1]:
-            buffer.append((token_type, part))
+            buffer.append((token_class, part))
             yield buffer
             buffer = []
 
-        buffer.append((token_type, parts[-1]))
+        buffer.append((token_class, parts[-1]))
 
     if buffer:
         yield buffer
 
 
-def filenode_as_lines_tokens(filenode):
-    """
-    Return a generator of lines with pygment tokens for a filenode eg:
-
-    [
-        (1, line1_tokens_list),
-        (2, line2_tokens_list),
-    ]
-    """
-
-    return enumerate(
-        tokens_as_lines(
-            tokenize_file(
-                filenode.content, get_lexer_for_filenode(filenode)
-            )
-        ),
-        1)
-
-
 def filenode_as_annotated_lines_tokens(filenode):
     """
     Take a file node and return a list of annotations => lines, if no annotation
@@ -120,9 +105,8 @@ def filenode_as_annotated_lines_tokens(f
     ]
     """
+    commit_cache = {}  # cache commit_getter lookups
 
-    # cache commit_getter lookups
-    commit_cache = {}
     def _get_annotation(commit_id, commit_getter):
         if commit_id not in commit_cache:
             commit_cache[commit_id] = commit_getter()
@@ -136,7 +120,7 @@ def filenode_as_annotated_lines_tokens(f
     annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
                          for line_no, tokens
-                         in filenode_as_lines_tokens(filenode))
+                         in enumerate(filenode_as_lines_tokens(filenode), 1))
 
     grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
 
@@ -145,3 +129,86 @@ def filenode_as_annotated_lines_tokens(f
         annotation, [(line_no, tokens)
                      for (_, line_no, tokens) in group]
     )
+
+
+def render_tokenstream(tokenstream):
+    result = []
+    for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
+
+        if token_class:
+            result.append(u'<span class="%s">' % token_class)
+        else:
+            result.append(u'<span>')
+
+        for op_tag, token_text in token_ops_texts:
+
+            if op_tag:
+                result.append(u'<%s>' % op_tag)
+
+            escaped_text = html_escape(token_text)
+            escaped_text = escaped_text.replace('\n', '<nl>\n')
+
+            result.append(escaped_text)
+
+            if op_tag:
+                result.append(u'</%s>' % op_tag)
+
+        result.append(u'</span>')
+
+    html = ''.join(result)
+    return html
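+
+# Example (illustrative sketch, not from the original change):
+# render_tokenstream accepts (class, text) 2-tuples or (class, op, text)
+# 3-tuples and emits one <span> per rolled-up class group:
+#
+#   render_tokenstream([('k', u'def'), ('', u' foo')])
+#   => u'<span class="k">def</span><span> foo</span>'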
+
+
+def rollup_tokenstream(tokenstream):
+    """
+    Group a token stream of the format:
+
+        ('class', 'op', 'text')
+    or
+        ('class', 'text')
+
+    into
+
+        [('class1',
+            [('op1', 'text'),
+             ('op2', 'text')]),
+         ('class2',
+            [('op3', 'text')])]
+
+    This is used to get the minimal tags necessary when
+    rendering to html, e.g. for a token stream:
+
+        <span class="A"><ins>he</ins>llo</span>
+    vs
+        <span class="A"><ins>h</ins></span><span class="A"><ins>e</ins>llo</span>
+
+    If a 2-tuple is passed in, the output op will be an empty string.
+
+    >>> rollup_tokenstream([('classA', '', 'h'),
+    ...                     ('classA', 'del', 'ell'),
+    ...                     ('classA', '', 'o'),
+    ...                     ('classB', '', ' '),
+    ...                     ('classA', '', 'the'),
+    ...                     ('classA', '', 're')])
+    [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
+     ('classB', [('', ' ')]),
+     ('classA', [('', 'there')])]
+    """
+    if tokenstream and len(tokenstream[0]) == 2:
+        tokenstream = ((t[0], '', t[1]) for t in tokenstream)
+
+    result = []
+    for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
+        ops = []
+        for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
+            text_buffer = []
+            for t_class, t_op, t_text in token_text_list:
+                text_buffer.append(t_text)
+            ops.append((token_op, ''.join(text_buffer)))
+        result.append((token_class, ops))
+    return result
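+
+# Example (illustrative sketch, not from the original change):
+# rollup_tokenstream groups adjacent tokens first by class, then by op,
+# concatenating their text:
+#
+#   rollup_tokenstream([('A', 'ins', 'h'), ('A', 'ins', 'e'), ('A', '', 'y')])
+#   => [('A', [('ins', 'he'), ('', 'y')])]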
diff --git a/rhodecode/public/css/code-block.less b/rhodecode/public/css/code-block.less
--- a/rhodecode/public/css/code-block.less
+++ b/rhodecode/public/css/code-block.less
@@ -644,6 +644,9 @@ pre.literal-block, .codehilite pre{
 
 /* START NEW CODE BLOCK CSS */
 
+@cb-line-height: 18px;
+@cb-line-code-padding: 10px;
+
 table.cb {
   width: 100%;
   border-collapse: collapse;
@@ -678,21 +681,23 @@ table.cb {
 
   td {
     vertical-align: top;
-    padding: 2px 10px;
+    padding: 0;
 
     &.cb-content {
-      white-space: pre-wrap;
-      font-family: @font-family-monospace;
      font-size: 12.35px;
 
-      span {
+      span.cb-code {
+        line-height: @cb-line-height;
+        padding-left: @cb-line-code-padding;
+        display: block;
+        white-space: pre-wrap;
+        font-family: @font-family-monospace;
        word-break: break-word;
       }
     }
 
     &.cb-lineno {
       padding: 0;
-      height: 1px; /* this allows the link to fill to 100% height of the td */
       width: 50px;
       color: rgba(0, 0, 0, 0.3);
       text-align: right;
@@ -702,21 +707,20 @@ table.cb {
       a::before {
         content: attr(data-line-no);
       }
-      &.cb-line-selected {
+      &.cb-line-selected a {
         background: @comment-highlight-color !important;
       }
 
       a {
         display: block;
-        height: 100%;
+        padding-right: @cb-line-code-padding;
+        line-height: @cb-line-height;
         color: rgba(0, 0, 0, 0.3);
-        padding: 0 10px; /* vertical padding is 0 so that height: 100% works */
-        line-height: 18px; /* use this instead of vertical padding */
       }
     }
 
     &.cb-content {
-      &.cb-line-selected {
+      &.cb-line-selected .cb-code {
         background: @comment-highlight-color !important;
       }
     }

diff --git a/rhodecode/templates/codeblocks/source.html b/rhodecode/templates/codeblocks/source.html
--- a/rhodecode/templates/codeblocks/source.html
+++ b/rhodecode/templates/codeblocks/source.html
@@ -2,9 +2,9 @@
     annotation=None,
     bgcolor=None)">
   <%
-  # avoid module lookups for performance
-  from rhodecode.lib.codeblocks import pygment_token_class
-  from rhodecode.lib.helpers import html_escape
+  from rhodecode.lib.codeblocks import render_tokenstream
+  # avoid module lookup for performance
+  html_escape = h.html_escape
   %>
   <td class="cb-content"
     %if bgcolor:
      style="background: ${bgcolor}"
     %endif
+    >
-    ${
-      ''.join(
-        '<span class="%s">%s</span>' %
-        (pygment_token_class(token_type), html_escape(token_text))
-        for token_type, token_text in tokens) + '\n' | n
-    }
-    ## this ugly list comp is necessary for performance
+    ## newline at end is necessary for highlight to work when line is empty
+    ## and for copy pasting code to work as expected
+    <span class="cb-code">${render_tokenstream(tokens)|n}${'\n'}</span>
   </td>

diff --git a/rhodecode/templates/files/files_source.html b/rhodecode/templates/files/files_source.html
--- a/rhodecode/templates/files/files_source.html
+++ b/rhodecode/templates/files/files_source.html
@@ -62,7 +62,7 @@
         ${sourceblock.render_annotation_lines(annotation, lines, color_hasher)}
       %endfor
     %else:
-      %for line_num, tokens in c.lines:
+      %for line_num, tokens in enumerate(c.lines, 1):
         ${sourceblock.render_line(line_num, tokens)}
       %endfor
     %endif

diff --git a/rhodecode/tests/lib/test_codeblocks.py b/rhodecode/tests/lib/test_codeblocks.py
new file mode 100644
--- /dev/null
+++ b/rhodecode/tests/lib/test_codeblocks.py
@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2016-2016 RhodeCode GmbH
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License, version 3
+# (only), as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# This program is dual-licensed. If you wish to learn more about the
+# RhodeCode Enterprise Edition, including its added features, Support services,
+# and proprietary license terms, please see https://rhodecode.com/licenses/
+
+import pytest
+
+from rhodecode.lib.codeblocks import (
+    tokenize_string, split_token_stream, rollup_tokenstream,
+    render_tokenstream)
+from pygments.lexers import get_lexer_by_name
+
+
+class TestTokenizeString(object):
+
+    python_code = '''
+    import this
+
+    var = 6
+    print "this"
+
+    '''
+
+    def test_tokenize_as_python(self):
+        lexer = get_lexer_by_name('python')
+        tokens = list(tokenize_string(self.python_code, lexer))
+
+        assert tokens == [
+            ('', u'\n'),
+            ('', u'    '),
+            ('kn', u'import'),
+            ('', u' '),
+            ('nn', u'this'),
+            ('', u'\n'),
+            ('', u'\n'),
+            ('', u'    '),
+            ('n', u'var'),
+            ('', u' '),
+            ('o', u'='),
+            ('', u' '),
+            ('mi', u'6'),
+            ('', u'\n'),
+            ('', u'    '),
+            ('k', u'print'),
+            ('', u' '),
+            ('s2', u'"'),
+            ('s2', u'this'),
+            ('s2', u'"'),
+            ('', u'\n'),
+            ('', u'\n'),
+            ('', u'    ')
+        ]
+
+    def test_tokenize_as_text(self):
+        lexer = get_lexer_by_name('text')
+        tokens = list(tokenize_string(self.python_code, lexer))
+
+        assert tokens == [
+            ('',
+             u'\n    import this\n\n    var = 6\n    print "this"\n\n    ')
+        ]
+
+
+class TestSplitTokenStream(object):
+
+    def test_split_token_stream(self):
+        lines = list(split_token_stream(
+            [('type1', 'some\ntext'), ('type2', 'more\n')]))
+
+        assert lines == [
+            [('type1', u'some')],
+            [('type1', u'text'), ('type2', u'more')],
+            [('type2', u'')],
+        ]
+
+    def test_split_token_stream_other_char(self):
+        lines = list(split_token_stream(
+            [('type1', 'some\ntext'), ('type2', 'more\n')],
+            split_string='m'))
+
+        assert lines == [
+            [('type1', 'so')],
+            [('type1', 'e\ntext'), ('type2', '')],
+            [('type2', 'ore\n')],
+        ]
+
+    def test_split_token_stream_without_char(self):
+        lines = list(split_token_stream(
+            [('type1', 'some\ntext'), ('type2', 'more\n')],
+            split_string='z'))
+
+        assert lines == [
+            [('type1', 'some\ntext'), ('type2', 'more\n')]
+        ]
+
+    def test_split_token_stream_single(self):
+        lines = list(split_token_stream(
+            [('type1', '\n')], split_string='\n'))
+
+        assert lines == [
+            [('type1', '')],
+            [('type1', '')],
+        ]
+
+    def test_split_token_stream_single_repeat(self):
+        lines = list(split_token_stream(
+            [('type1', '\n\n\n')], split_string='\n'))
+
+        assert lines == [
+            [('type1', '')],
+            [('type1', '')],
+            [('type1', '')],
+            [('type1', '')],
+        ]
+
+    def test_split_token_stream_multiple_repeat(self):
+        lines = list(split_token_stream(
+            [('type1', '\n\n'), ('type2', '\n\n')], split_string='\n'))
+
+        assert lines == [
+            [('type1', '')],
+            [('type1', '')],
+            [('type1', ''), ('type2', '')],
+            [('type2', '')],
+            [('type2', '')],
+        ]
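+
+    def test_split_token_stream_multichar_split_string(self):
+        # illustrative extra case (editorial sketch, not in the original
+        # change): split_string may be longer than one character, following
+        # str.split semantics
+        lines = list(split_token_stream(
+            [('type1', 'abxxcd'), ('type2', 'efxxgh')], split_string='xx'))
+
+        assert lines == [
+            [('type1', 'ab')],
+            [('type1', 'cd'), ('type2', 'ef')],
+            [('type2', 'gh')],
+        ]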
+
+
+class TestRollupTokens(object):
+
+    @pytest.mark.parametrize('tokenstream,output', [
+        ([],
+            []),
+        ([('A', 'hell'), ('A', 'o')], [
+            ('A', [
+                ('', 'hello')]),
+        ]),
+        ([('A', 'hell'), ('B', 'o')], [
+            ('A', [
+                ('', 'hell')]),
+            ('B', [
+                ('', 'o')]),
+        ]),
+        ([('A', 'hel'), ('A', 'lo'), ('B', ' '), ('A', 'there')], [
+            ('A', [
+                ('', 'hello')]),
+            ('B', [
+                ('', ' ')]),
+            ('A', [
+                ('', 'there')]),
+        ]),
+    ])
+    def test_rollup_tokenstream_without_ops(self, tokenstream, output):
+        assert list(rollup_tokenstream(tokenstream)) == output
+
+    @pytest.mark.parametrize('tokenstream,output', [
+        ([],
+            []),
+        ([('A', '', 'hell'), ('A', '', 'o')], [
+            ('A', [
+                ('', 'hello')]),
+        ]),
+        ([('A', '', 'hell'), ('B', '', 'o')], [
+            ('A', [
+                ('', 'hell')]),
+            ('B', [
+                ('', 'o')]),
+        ]),
+        ([('A', '', 'h'), ('B', '', 'e'), ('C', '', 'y')], [
+            ('A', [
+                ('', 'h')]),
+            ('B', [
+                ('', 'e')]),
+            ('C', [
+                ('', 'y')]),
+        ]),
+        ([('A', '', 'h'), ('A', '', 'e'), ('C', '', 'y')], [
+            ('A', [
+                ('', 'he')]),
+            ('C', [
+                ('', 'y')]),
+        ]),
+        ([('A', 'ins', 'h'), ('A', 'ins', 'e')], [
+            ('A', [
+                ('ins', 'he')
+            ]),
+        ]),
+        ([('A', 'ins', 'h'), ('A', 'del', 'e')], [
+            ('A', [
+                ('ins', 'h'),
+                ('del', 'e')
+            ]),
+        ]),
+        ([('A', 'ins', 'h'), ('B', 'del', 'e'), ('B', 'del', 'y')], [
+            ('A', [
+                ('ins', 'h'),
+            ]),
+            ('B', [
+                ('del', 'ey'),
+            ]),
+        ]),
+        ([('A', 'ins', 'h'), ('A', 'del', 'e'), ('B', 'del', 'y')], [
+            ('A', [
+                ('ins', 'h'),
+                ('del', 'e'),
+            ]),
+            ('B', [
+                ('del', 'y'),
+            ]),
+        ]),
+        ([('A', '', 'some'), ('A', 'ins', 'new'), ('A', '', 'name')], [
+            ('A', [
+                ('', 'some'),
+                ('ins', 'new'),
+                ('', 'name'),
+            ]),
+        ]),
+    ])
+    def test_rollup_tokenstream_with_ops(self, tokenstream, output):
+        assert list(rollup_tokenstream(tokenstream)) == output
+
+
+class TestRenderTokenStream(object):
+
+    @pytest.mark.parametrize('tokenstream,output', [
+        (
+            [],
+            '',
+        ),
+        (
+            [('', '', u'')],
+            '<span></span>',
+        ),
+        (
+            [('', '', u'text')],
+            '<span>text</span>',
+        ),
+        (
+            [('A', '', u'')],
+            '<span class="A"></span>',
+        ),
+        (
+            [('A', '', u'hello')],
+            '<span class="A">hello</span>',
+        ),
+        (
+            [('A', '', u'hel'), ('A', '', u'lo')],
+            '<span class="A">hello</span>',
+        ),
+        (
+            [('A', '', u'two\n'), ('A', '', u'lines')],
+            '<span class="A">two<nl>\nlines</span>',
+        ),
+        (
+            [('A', '', u'\nthree\n'), ('A', '', u'lines')],
+            '<span class="A"><nl>\nthree<nl>\nlines</span>',
+        ),
+        (
+            [('', '', u'\n'), ('A', '', u'line')],
+            '<span><nl>\n</span><span class="A">line</span>',
+        ),
+        (
+            [('', 'ins', u'\n'), ('A', '', u'line')],
+            '<span><ins><nl>\n</ins></span><span class="A">line</span>',
+        ),
+        (
+            [('A', '', u'hel'), ('A', 'ins', u'lo')],
+            '<span class="A">hel<ins>lo</ins></span>',
+        ),
+        (
+            [('A', '', u'hel'), ('A', 'ins', u'l'), ('A', 'ins', u'o')],
+            '<span class="A">hel<ins>lo</ins></span>',
+        ),
+        (
+            [('A', '', u'hel'), ('A', 'ins', u'l'), ('A', 'del', u'o')],
+            '<span class="A">hel<ins>l</ins><del>o</del></span>',
+        ),
+        (
+            [('A', '', u'hel'), ('B', '', u'lo')],
+            '<span class="A">hel</span><span class="B">lo</span>',
+        ),
+        (
+            [('A', '', u'hel'), ('B', 'ins', u'lo')],
+            '<span class="A">hel</span><span class="B"><ins>lo</ins></span>',
+        ),
+    ])
+    def test_render_tokenstream_with_ops(self, tokenstream, output):
+        html = render_tokenstream(tokenstream)
+        assert html == output
+
+    @pytest.mark.parametrize('tokenstream,output', [
+        (
+            [('A', u'hel'), ('A', u'lo')],
+            '<span class="A">hello</span>',
+        ),
+        (
+            [('A', u'hel'), ('A', u'l'), ('A', u'o')],
+            '<span class="A">hello</span>',
+        ),
+        (
+            [('A', u'hel'), ('B', u'lo')],
+            '<span class="A">hel</span><span class="B">lo</span>',
+        ),
+    ])
+    def test_render_tokenstream_without_ops(self, tokenstream, output):
+        html = render_tokenstream(tokenstream)
+        assert html == output
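+
+
+# Example (illustrative sketch, not from the original change): the three
+# helpers compose as tokenize -> split -> render:
+#
+#   tokens = tokenize_string(u'a\nb', get_lexer_by_name('text'))
+#   lines = list(split_token_stream(tokens))
+#   [render_tokenstream(line) for line in lines]
+#   => [u'<span>a</span>', u'<span>b</span>']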