From 017de3a74c810420f1be2ad0de19fdff2b8dfe17 2014-02-04 23:02:04
From: Brian E. Granger <ellisonbg@gmail.com>
Date: 2014-02-04 23:02:04
Subject: [PATCH] Merge pull request #5025 from jdfreder/jakob-html

citation2latex filter (using HTMLParser)
---
diff --git a/IPython/nbconvert/filters/citation.py b/IPython/nbconvert/filters/citation.py
index 1442d55..2ecd717 100644
--- a/IPython/nbconvert/filters/citation.py
+++ b/IPython/nbconvert/filters/citation.py
@@ -9,9 +9,17 @@
 #-----------------------------------------------------------------------------
 
 #-----------------------------------------------------------------------------
-# Code
+# Imports
 #-----------------------------------------------------------------------------
+from IPython.utils.py3compat import PY3
+if PY3:
+    from html.parser import HTMLParser
+else:
+    from HTMLParser import HTMLParser
 
+#-----------------------------------------------------------------------------
+# Functions
+#-----------------------------------------------------------------------------
 
 __all__ = ['citation2latex']
 
@@ -32,41 +40,72 @@ def citation2latex(s):
     Any HTML tag can be used, which allows the citations to be formatted
     in HTML in any manner.
     """
-    try:
-        from lxml import html
-    except ImportError:
-        return s
-
-    tree = html.fragment_fromstring(s, create_parent='div')
-    _process_node_cite(tree)
-    s = html.tostring(tree, encoding='unicode')
-    if s.endswith('</div>'):
-        s = s[:-6]
-    if s.startswith('<div>'):
-        s = s[5:]
-    return s
+    parser = CitationParser()
+    parser.feed(s)
+    parser.close()
+    outtext = u''
+    startpos = 0
+    for citation in parser.citelist:
+            outtext += s[startpos:citation[1]]
+            outtext += '\\cite{%s}'%citation[0]
+            startpos = citation[2] if len(citation)==3 else -1
+    outtext += s[startpos:] if startpos != -1 else ''
+    return outtext
 
+#-----------------------------------------------------------------------------
+# Classes
+#-----------------------------------------------------------------------------
+class CitationParser(HTMLParser):
+    """Citation Parser
 
-def _process_node_cite(node):
-    """Do the citation replacement as we walk the lxml tree."""
+    Replaces HTML tags that carry a data-cite attribute with the respective LaTeX \\cite.
     
-    def _get(o, name):
-        value = getattr(o, name, None)
-        return '' if value is None else value
+    Inherits from HTMLParser, overrides:
+     - handle_starttag
+     - handle_endtag
+    """
+    # number of open tags
+    opentags = None
+    # list of found citations
+    citelist = None
+    # active citation tag
+    citetag = None
+
+    def __init__(self):
+        self.citelist = []
+        self.opentags = 0
+        HTMLParser.__init__(self)
     
-    if 'data-cite' in node.attrib:
-        cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
-        prev = node.getprevious()
-        if prev is not None:
-            prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')
-        else:
-            parent = node.getparent()
-            if parent is not None:
-                parent.text = _get(parent, 'text') + cite + _get(node, 'tail')
-        try:
-            node.getparent().remove(node)
-        except AttributeError:
-            pass
-    else:
-        for child in node:
-            _process_node_cite(child)
+    def get_offset(self):
+        # Compute start position in source
+        lin, offset = self.getpos()
+        pos = 0
+        for i in range(lin-1):
+            pos = self.data.find('\n',pos) + 1
+        return pos + offset
+        
+    def handle_starttag(self, tag, attrs):
+        # for each tag check if attributes are present and if no citation is active
+        if self.opentags == 0 and len(attrs)>0:
+            for atr, data in attrs:
+                if atr.lower() == 'data-cite':
+                    self.citetag = tag
+                    self.opentags = 1
+                    self.citelist.append([data, self.get_offset()])
+                    return
+                
+        if tag == self.citetag:
+            # found an open citation tag but not the starting one  
+            self.opentags += 1
+  
+    def handle_endtag(self, tag):
+        if tag == self.citetag:
+            # found a closing citation tag; check if it closes the starting one
+            if self.opentags == 1:
+                pos = self.get_offset()
+                self.citelist[-1].append(pos+len(tag)+3)
+            self.opentags -= 1
+        
+    def feed(self, data):
+        self.data = data
+        HTMLParser.feed(self, data)
diff --git a/IPython/nbconvert/filters/tests/test_citation.py b/IPython/nbconvert/filters/tests/test_citation.py
index f36c9ac..3fc898c 100644
--- a/IPython/nbconvert/filters/tests/test_citation.py
+++ b/IPython/nbconvert/filters/tests/test_citation.py
@@ -9,15 +9,13 @@
 #-----------------------------------------------------------------------------
 # Imports
 #-----------------------------------------------------------------------------
-
 from ..citation import citation2latex
 from nose.tools import assert_equal
 
 #-----------------------------------------------------------------------------
 # Tests
 #-----------------------------------------------------------------------------
-
-test_md = """
+test_md = {"""
 # My Heading
 
 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -26,14 +24,13 @@ velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fper
 Lorem ipsum dolor sit amet, consectetur adipiscing elit
 <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra.
 
-<p>Here is a plain paragraph that should be unaffected.</p>
+<p>Here is a plain paragraph that should be unaffected. It contains simple
+relations like 1<2 & 4>5.</p>
 
 * One <cite data-cite="jdfreder">Jonathan</cite>.
 * Two <cite data-cite="carreau">Matthias</cite>.
 * Three <cite data-cite="ivanov">Paul</cite>.
-"""
-
-test_md_parsed = """
+""":  """
 # My Heading
 
 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -42,18 +39,112 @@ velit, lobortis sed interdum at, vestibulum vitae libero \cite{fperez}.
 Lorem ipsum dolor sit amet, consectetur adipiscing elit
 \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra.
 
-<p>Here is a plain paragraph that should be unaffected.</p>
+<p>Here is a plain paragraph that should be unaffected. It contains simple
+relations like 1<2 & 4>5.</p>
 
 * One \cite{jdfreder}.
 * Two \cite{carreau}.
 * Three \cite{ivanov}.
-"""
+""",
+
+# No citations
+r"""The quick brown fox jumps over the lazy dog.""": 
+r"""The quick brown fox jumps over the lazy dog.""",
+
+# Simple inline
+r"""Foo <cite data-cite=asdf>Text</cite> bar""":
+r"""Foo \cite{asdf} bar""",
+
+# Multiline
+r"""<cite data-cite=ewqr>Text
+</cite>Foo""":
+r"""\cite{ewqr}Foo""",
+
+# Nested tags
+r"""<div><div data-cite=Foo><div>Text</div></div></div> Bar""":
+r"""<div>\cite{Foo}</div> Bar""",
+
+# Including Maths
+r"""Foo $3*2*1$ <div data-cite=Foo>Text</div> Bar""":
+r"""Foo $3*2*1$ \cite{Foo} Bar""",
+
+# Missing end tag
+r"""<cite data-cite=asdf>Test Foo""":
+r"""\cite{asdf}""",
+
+r"""<cite data-cite=asdf><cite>Test Foo""":
+r"""\cite{asdf}""",
+
+r"""<cite data-cite=asdf><cite>Test</cite> Foo""":
+r"""\cite{asdf}""",
+
+# Multiple arguments
+r"""<cite width=qwer data-cite=asdf>Test</cite> Foo""":
+r"""\cite{asdf} Foo""",
+
+# Wrong capitalization
+r"""<CITE data-cite=asdf>Test</cite> Foo""":
+r"""\cite{asdf} Foo""",
+
+r"""<cite DATA-CITE=asdf>Test</cite> Foo""":
+r"""\cite{asdf} Foo""",
+
+# Wrong end tag
+r"""<asd data-cite=wer> ksjfs </asdf> sdf ds """:
+r"""\cite{wer}""",
+
+r"""<asd data-cite=wer>""":
+r"""\cite{wer}""",
+
+# Invalid tag names
+r"""<frog> <foo data-cite=wer></foo>""":
+r"""<frog> \cite{wer}""",
+
+# Non-nested tags
+r"""<strong> <h1> <cite data-cite=asdf></cite>Test</strong> Foo </h1>""":
+r"""<strong> <h1> \cite{asdf}Test</strong> Foo </h1>""",
+
+# LXML errors
+r"""Foo
+\begin{eqnarray}
+1 & <cite data-cite=bar>bar1</cite> \\
+3 & 4 \\
+\end{eqnarray}""":
+r"""Foo
+\begin{eqnarray}
+1 & \cite{bar} \\
+3 & 4 \\
+\end{eqnarray}""",
+
+r"""
+1<2 is true, but 3>4 is false.
+
+$1<2$ is true, but $3>4$ is false.
+
+1<2 it is even worse if it is alone in a line.""":
+r"""
+1<2 is true, but 3>4 is false.
+
+$1<2$ is true, but $3>4$ is false.
+
+1<2 it is even worse if it is alone in a line.""",
+
+r"""
+1 < 2 is true, but 3 > 4 is false
+
+$1 < 2$ is true, but $3 > 4$ is false
+
+1 < 2 it is even worse if it is alone in a line.
+""":
+r"""
+1 < 2 is true, but 3 > 4 is false
+
+$1 < 2$ is true, but $3 > 4$ is false
+
+1 < 2 it is even worse if it is alone in a line.
+"""}
 
 def test_citation2latex():
     """Are citations parsed properly?"""
-    try:
-        from lxml import html  #analysis:ignore
-    except ImportError:
-        assert_equal(test_md, citation2latex(test_md))
-    else:
-        assert_equal(test_md_parsed, citation2latex(test_md))
+    for input, output in test_md.items():
+        yield (assert_equal, citation2latex(input), output)