From 017de3a74c810420f1be2ad0de19fdff2b8dfe17 2014-02-04 23:02:04 From: Brian E. Granger Date: 2014-02-04 23:02:04 Subject: [PATCH] Merge pull request #5025 from jdfreder/jakob-html citation2latex filter (using HTMLParser) --- diff --git a/IPython/nbconvert/filters/citation.py b/IPython/nbconvert/filters/citation.py index 1442d55..2ecd717 100644 --- a/IPython/nbconvert/filters/citation.py +++ b/IPython/nbconvert/filters/citation.py @@ -9,9 +9,17 @@ #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- -# Code +# Imports #----------------------------------------------------------------------------- +from IPython.utils.py3compat import PY3 +if PY3: + from html.parser import HTMLParser +else: + from HTMLParser import HTMLParser +#----------------------------------------------------------------------------- +# Functions +#----------------------------------------------------------------------------- __all__ = ['citation2latex'] @@ -32,41 +40,72 @@ def citation2latex(s): Any HTML tag can be used, which allows the citations to be formatted in HTML in any manner. """ - try: - from lxml import html - except ImportError: - return s - - tree = html.fragment_fromstring(s, create_parent='div') - _process_node_cite(tree) - s = html.tostring(tree, encoding='unicode') - if s.endswith(''): - s = s[:-6] - if s.startswith('
'): - s = s[5:] - return s + parser = CitationParser() + parser.feed(s) + parser.close() + outtext = u'' + startpos = 0 + for citation in parser.citelist: + outtext += s[startpos:citation[1]] + outtext += '\\cite{%s}'%citation[0] + startpos = citation[2] if len(citation)==3 else -1 + outtext += s[startpos:] if startpos != -1 else '' + return outtext +#----------------------------------------------------------------------------- +# Classes +#----------------------------------------------------------------------------- +class CitationParser(HTMLParser): + """Citation Parser -def _process_node_cite(node): - """Do the citation replacement as we walk the lxml tree.""" + Replaces html tags with data-cite attribute with respective latex \\cite. - def _get(o, name): - value = getattr(o, name, None) - return '' if value is None else value + Inherites from HTMLParser, overrides: + - handle_starttag + - handle_endtag + """ + # number of open tags + opentags = None + # list of found citations + citelist = None + # active citation tag + citetag = None + + def __init__(self): + self.citelist = [] + self.opentags = 0 + HTMLParser.__init__(self) - if 'data-cite' in node.attrib: - cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']} - prev = node.getprevious() - if prev is not None: - prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail') - else: - parent = node.getparent() - if parent is not None: - parent.text = _get(parent, 'text') + cite + _get(node, 'tail') - try: - node.getparent().remove(node) - except AttributeError: - pass - else: - for child in node: - _process_node_cite(child) + def get_offset(self): + # Compute startposition in source + lin, offset = self.getpos() + pos = 0 + for i in range(lin-1): + pos = self.data.find('\n',pos) + 1 + return pos + offset + + def handle_starttag(self, tag, attrs): + # for each tag check if attributes are present and if no citation is active + if self.opentags == 0 and len(attrs)>0: + for atr, data in attrs: + if atr.lower() == 'data-cite': + self.citetag = tag + self.opentags = 1 + self.citelist.append([data, self.get_offset()]) + return + + if tag == self.citetag: + # found an open citation tag but not the starting one + self.opentags += 1 + + def handle_endtag(self, tag): + if tag == self.citetag: + # found citation tag check if starting one + if self.opentags == 1: + pos = self.get_offset() + self.citelist[-1].append(pos+len(tag)+3) + self.opentags -= 1 + + def feed(self, data): + self.data = data + HTMLParser.feed(self, data) diff --git a/IPython/nbconvert/filters/tests/test_citation.py b/IPython/nbconvert/filters/tests/test_citation.py index f36c9ac..3fc898c 100644 --- a/IPython/nbconvert/filters/tests/test_citation.py +++ b/IPython/nbconvert/filters/tests/test_citation.py @@ -9,15 +9,13 @@ #----------------------------------------------------------------------------- # Imports #----------------------------------------------------------------------------- - from ..citation import citation2latex from nose.tools import assert_equal #----------------------------------------------------------------------------- # Tests #----------------------------------------------------------------------------- - -test_md = """ +test_md = {""" # My Heading Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue @@ -26,14 +24,13 @@ velit, lobortis sed interdum at, vestibulum vitae libero Thomas. Quisque iaculis ligula ut ipsum mattis viverra. -

Here is a plain paragraph that should be unaffected.

+

Here is a plain paragraph that should be unaffected. It contains simple +relations like 1<2 & 4>5.

* One Jonathan. * Two Matthias. * Three Paul. -""" - -test_md_parsed = """ +""": """ # My Heading Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue @@ -42,18 +39,112 @@ velit, lobortis sed interdum at, vestibulum vitae libero \cite{fperez}. Lorem ipsum dolor sit amet, consectetur adipiscing elit \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra. -

Here is a plain paragraph that should be unaffected.

+

Here is a plain paragraph that should be unaffected. It contains simple +relations like 1<2 & 4>5.

* One \cite{jdfreder}. * Two \cite{carreau}. * Three \cite{ivanov}. -""" +""", + +# No citations +r"""The quick brown fox jumps over the lazy dog.""": +r"""The quick brown fox jumps over the lazy dog.""", + +# Simple inline +r"""Foo Text bar""": +r"""Foo \cite{asdf} bar""", + +# Multiline +r"""Text +Foo""": +r"""\cite{ewqr}Foo""", + +# Nested tags +r"""
Text
Bar""": +r"""
\cite{Foo}
Bar""", + +# Including Maths +r"""Foo $3*2*1$
Text
Bar""": +r"""Foo $3*2*1$ \cite{Foo} Bar""", + +# Missing end tag +r"""Test Foo""": +r"""\cite{asdf}""", + +r"""Test Foo""": +r"""\cite{asdf}""", + +r"""Test Foo""": +r"""\cite{asdf}""", + +# Multiple arguments +r"""Test Foo""": +r"""\cite{asdf} Foo""", + +# Wrong capitalization +r"""Test Foo""": +r"""\cite{asdf} Foo""", + +r"""Test Foo""": +r"""\cite{asdf} Foo""", + +# Wrong end tag +r""" ksjfs sdf ds """: +r"""\cite{wer}""", + +r"""""": +r"""\cite{wer}""", + +# Invalid tag names +r""" """: +r""" \cite{wer}""", + +# Non-nested tags +r"""

Test Foo

""": +r"""

\cite{asdf}Test Foo

""", + +# LXML errors +r"""Foo +\begin{eqnarray} +1 & bar1 \\ +3 & 4 \\ +\end{eqnarray}""": +r"""Foo +\begin{eqnarray} +1 & \cite{bar} \\ +3 & 4 \\ +\end{eqnarray}""", + +r""" +1<2 is true, but 3>4 is false. + +$1<2$ is true, but $3>4$ is false. + +1<2 it is even worse if it is alone in a line.""": +r""" +1<2 is true, but 3>4 is false. + +$1<2$ is true, but $3>4$ is false. + +1<2 it is even worse if it is alone in a line.""", + +r""" +1 < 2 is true, but 3 > 4 is false + +$1 < 2$ is true, but $3 > 4$ is false + +1 < 2 it is even worse if it is alone in a line. +""": +r""" +1 < 2 is true, but 3 > 4 is false + +$1 < 2$ is true, but $3 > 4$ is false + +1 < 2 it is even worse if it is alone in a line. +"""} def test_citation2latex(): """Are citations parsed properly?""" - try: - from lxml import html #analysis:ignore - except ImportError: - assert_equal(test_md, citation2latex(test_md)) - else: - assert_equal(test_md_parsed, citation2latex(test_md)) + for input, output in test_md.items(): + yield (assert_equal, citation2latex(input), output)