From c6fe9d0291bc6b3819722775aa02cbd24671938a 2014-02-04 20:20:40
From: jakobgager <jakobgager@hotmail.com>
Date: 2014-02-04 20:20:40
Subject: [PATCH] Replace lxml with HTMLParser in citation2latex

---
diff --git a/IPython/nbconvert/filters/citation.py b/IPython/nbconvert/filters/citation.py
index 1442d55..2ee7e13 100644
--- a/IPython/nbconvert/filters/citation.py
+++ b/IPython/nbconvert/filters/citation.py
@@ -9,9 +9,13 @@
 #-----------------------------------------------------------------------------
 
 #-----------------------------------------------------------------------------
-# Code
+# Imports
 #-----------------------------------------------------------------------------
+from HTMLParser import HTMLParser
 
+#-----------------------------------------------------------------------------
+# Functions
+#-----------------------------------------------------------------------------
 
 __all__ = ['citation2latex']
 
@@ -32,41 +36,110 @@ def citation2latex(s):
     Any HTML tag can be used, which allows the citations to be formatted
     in HTML in any manner.
     """
-    try:
-        from lxml import html
-    except ImportError:
-        return s
-
-    tree = html.fragment_fromstring(s, create_parent='div')
-    _process_node_cite(tree)
-    s = html.tostring(tree, encoding='unicode')
-    if s.endswith('</div>'):
-        s = s[:-6]
-    if s.startswith('<div>'):
-        s = s[5:]
-    return s
+    parser = CitationParser()
+    parser.feed(s)
+    parser.close()
+    # Stitch the output together from the untouched text between citations
+    # and one \cite command per [key, start, end] entry found by the parser.
+    pieces = []
+    startpos = 0
+    for key, start, end in parser.citelist:
+        pieces.append(s[startpos:start])
+        pieces.append('\\cite{%s}' % key)
+        startpos = end
+    pieces.append(s[startpos:])
+    return u''.join(pieces)
 
+#-----------------------------------------------------------------------------
+# Classes
+#-----------------------------------------------------------------------------
+class CitationParser(HTMLParser):
+    """Citation Parser
 
-def _process_node_cite(node):
-    """Do the citation replacement as we walk the lxml tree."""
+    Replaces html tags with data-cite attribute with respective latex \\cite.
     
-    def _get(o, name):
-        value = getattr(o, name, None)
-        return '' if value is None else value
+    Inherits from HTMLParser, overrides:
+     - handle_starttag
+     - handle_startendtag
+     - handle_endtag
+     - feed
+
+    After parsing, ``citelist`` holds one ``[key, start, end]`` entry per
+    outermost citation tag; ``start`` and ``end`` are indices into the fed
+    source delimiting the text to be replaced.  A citation tag that is
+    never closed covers only its own start tag.
+    """
+    # nesting depth of tags named like the active citation tag (0: none open)
+    opentags = None
+    # list of found citations
+    citelist = None
+    # name of the tag that opened the active citation, None if none is open
+    citetag = None
+
+    def __init__(self):
+        self.citelist = []
+        self.opentags = 0
+        HTMLParser.__init__(self)
     
-    if 'data-cite' in node.attrib:
-        cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
-        prev = node.getprevious()
-        if prev is not None:
-            prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')
-        else:
-            parent = node.getparent()
-            if parent is not None:
-                parent.text = _get(parent, 'text') + cite + _get(node, 'tail')
-        try:
-            node.getparent().remove(node)
-        except AttributeError:
-            pass
-    else:
-        for child in node:
-            _process_node_cite(child)
+    def get_offset(self):
+        """Return the index in self.data of the current parser position."""
+        # getpos() reports a 1-based line number and a column; skip over
+        # the preceding lines to turn that into an absolute index.
+        lin, offset = self.getpos()
+        pos = 0
+        for i in range(lin-1):
+            pos = self.data.find('\n',pos) + 1
+        return pos + offset
+
+    def _close_citetag(self):
+        """Leave one nesting level; forget the citation tag once closed."""
+        self.opentags -= 1
+        if self.opentags == 0:
+            # Without this reset, later unrelated tags of the same name
+            # would still be counted as belonging to a citation.
+            self.citetag = None
+
+    def handle_starttag(self, tag, attrs):
+        # a new citation can only start while no other citation is active
+        if self.opentags == 0:
+            for atr, data in attrs:
+                if atr.lower() == 'data-cite':
+                    start = self.get_offset()
+                    # Provisionally end the span right after the start tag,
+                    # so a tag that is never closed (e.g. <img>) still gives
+                    # a complete entry; handle_endtag moves the end later.
+                    end = start + len(self.get_starttag_text())
+                    self.citetag = tag
+                    self.opentags = 1
+                    self.citelist.append([data, start, end])
+                    return
+
+        if tag == self.citetag:
+            # found an open citation tag but not the starting one
+            self.opentags += 1
+
+    def handle_startendtag(self, tag, attrs):
+        # A self-closing tag (<cite data-cite="key"/>) has no separate end
+        # tag whose position could be used, so close it right away and keep
+        # the span recorded by handle_starttag.
+        self.handle_starttag(tag, attrs)
+        if tag == self.citetag:
+            self._close_citetag()
+
+    def handle_endtag(self, tag):
+        if tag == self.citetag:
+            if self.opentags == 1:
+                # End tag of the citation itself: extend the span up to and
+                # including its closing '>' (which may follow whitespace).
+                closing = self.data.find('>', self.get_offset())
+                if closing != -1:
+                    self.citelist[-1][2] = closing + 1
+            self._close_citetag()
+
+    def feed(self, data):
+        """Feed data to the parser, keeping a copy for offset computation."""
+        # NOTE: the source is replaced, not appended, on every call, while
+        # getpos() keeps counting across calls; feed the whole document at
+        # once or the computed offsets will be wrong.
+        self.data = data
+        HTMLParser.feed(self, data)