From c6fe9d0291bc6b3819722775aa02cbd24671938a 2014-02-04 20:20:40 From: jakobgager Date: 2014-02-04 20:20:40 Subject: [PATCH] Replace lxml with HTMLParser in citation2latex --- diff --git a/IPython/nbconvert/filters/citation.py b/IPython/nbconvert/filters/citation.py index 1442d55..2ee7e13 100644 --- a/IPython/nbconvert/filters/citation.py +++ b/IPython/nbconvert/filters/citation.py @@ -9,9 +9,13 @@ #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- -# Code +# Imports #----------------------------------------------------------------------------- +from HTMLParser import HTMLParser +#----------------------------------------------------------------------------- +# Functions +#----------------------------------------------------------------------------- __all__ = ['citation2latex'] @@ -32,41 +36,72 @@ def citation2latex(s): Any HTML tag can be used, which allows the citations to be formatted in HTML in any manner. """ - try: - from lxml import html - except ImportError: - return s - - tree = html.fragment_fromstring(s, create_parent='div') - _process_node_cite(tree) - s = html.tostring(tree, encoding='unicode') - if s.endswith(''): - s = s[:-6] - if s.startswith('
'): - s = s[5:] - return s + parser = CitationParser() + parser.feed(s) + parser.close() + outtext = u'' + startpos = 0 + for citation in parser.citelist: + outtext += s[startpos:citation[1]] + outtext += '\\cite{%s}'%citation[0] + startpos = citation[2] + outtext += s[startpos:] + return outtext +#----------------------------------------------------------------------------- +# Classes +#----------------------------------------------------------------------------- +class CitationParser(HTMLParser): + """Citation Parser -def _process_node_cite(node): - """Do the citation replacement as we walk the lxml tree.""" + Replaces html tags with data-cite attribute with respective latex \\cite. - def _get(o, name): - value = getattr(o, name, None) - return '' if value is None else value + Inherites from HTMLParser, overrides: + - handle_starttag + - handle_endtag + """ + # number of open tags + opentags = None + # list of found citations + citelist = None + # active citation tag + citetag = None + + def __init__(self): + self.citelist = [] + self.opentags = 0 + HTMLParser.__init__(self) - if 'data-cite' in node.attrib: - cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']} - prev = node.getprevious() - if prev is not None: - prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail') - else: - parent = node.getparent() - if parent is not None: - parent.text = _get(parent, 'text') + cite + _get(node, 'tail') - try: - node.getparent().remove(node) - except AttributeError: - pass - else: - for child in node: - _process_node_cite(child) + def get_offset(self): + # Compute startposition in source + lin, offset = self.getpos() + pos = 0 + for i in range(lin-1): + pos = self.data.find('\n',pos) + 1 + return pos + offset + + def handle_starttag(self, tag, attrs): + # for each tag check if attributes are present and if no citation is active + if self.opentags == 0 and len(attrs)>0: + for atr, data in attrs: + if atr.lower() == 'data-cite': + self.citetag = tag + self.opentags = 1 + self.citelist.append([data, self.get_offset()]) + return + + if tag == self.citetag: + # found an open citation tag but not the starting one + self.opentags += 1 + + def handle_endtag(self, tag): + if tag == self.citetag: + # found citation tag check if starting one + if self.opentags == 1: + pos = self.get_offset() + self.citelist[-1].append(pos+len(tag)+3) + self.opentags -= 1 + + def feed(self, data): + self.data = data + HTMLParser.feed(self, data)