##// END OF EJS Templates
Merge pull request #5025 from jdfreder/jakob-html...
Brian E. Granger -
r15061:017de3a7 merge
parent child Browse files
Show More
@@ -9,9 +9,17 b''
9 9 #-----------------------------------------------------------------------------
10 10
11 11 #-----------------------------------------------------------------------------
12 # Code
12 # Imports
13 13 #-----------------------------------------------------------------------------
14 from IPython.utils.py3compat import PY3
15 if PY3:
16 from html.parser import HTMLParser
17 else:
18 from HTMLParser import HTMLParser
14 19
20 #-----------------------------------------------------------------------------
21 # Functions
22 #-----------------------------------------------------------------------------
15 23
16 24 __all__ = ['citation2latex']
17 25
@@ -32,41 +40,72 b' def citation2latex(s):'
32 40 Any HTML tag can be used, which allows the citations to be formatted
33 41 in HTML in any manner.
34 42 """
35 try:
36 from lxml import html
37 except ImportError:
38 return s
39
40 tree = html.fragment_fromstring(s, create_parent='div')
41 _process_node_cite(tree)
42 s = html.tostring(tree, encoding='unicode')
43 if s.endswith('</div>'):
44 s = s[:-6]
45 if s.startswith('<div>'):
46 s = s[5:]
47 return s
43 parser = CitationParser()
44 parser.feed(s)
45 parser.close()
46 outtext = u''
47 startpos = 0
48 for citation in parser.citelist:
49 outtext += s[startpos:citation[1]]
50 outtext += '\\cite{%s}'%citation[0]
51 startpos = citation[2] if len(citation)==3 else -1
52 outtext += s[startpos:] if startpos != -1 else ''
53 return outtext
48 54
55 #-----------------------------------------------------------------------------
56 # Classes
57 #-----------------------------------------------------------------------------
58 class CitationParser(HTMLParser):
59 """Citation Parser
49 60
50 def _process_node_cite(node):
51 """Do the citation replacement as we walk the lxml tree."""
61 Replaces html tags with data-cite attribute with respective latex \\cite.
52 62
53 def _get(o, name):
54 value = getattr(o, name, None)
55 return '' if value is None else value
63 Inherits from HTMLParser, overrides:
64 - handle_starttag
65 - handle_endtag
66 """
67 # number of open tags
68 opentags = None
69 # list of found citations
70 citelist = None
71 # active citation tag
72 citetag = None
73
74 def __init__(self):
75 self.citelist = []
76 self.opentags = 0
77 HTMLParser.__init__(self)
56 78
57 if 'data-cite' in node.attrib:
58 cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
59 prev = node.getprevious()
60 if prev is not None:
61 prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')
62 else:
63 parent = node.getparent()
64 if parent is not None:
65 parent.text = _get(parent, 'text') + cite + _get(node, 'tail')
66 try:
67 node.getparent().remove(node)
68 except AttributeError:
69 pass
70 else:
71 for child in node:
72 _process_node_cite(child)
79 def get_offset(self):
80 # Compute the absolute start position in the source string
81 lin, offset = self.getpos()
82 pos = 0
83 for i in range(lin-1):
84 pos = self.data.find('\n',pos) + 1
85 return pos + offset
86
87 def handle_starttag(self, tag, attrs):
88 # for each tag check if attributes are present and if no citation is active
89 if self.opentags == 0 and len(attrs)>0:
90 for atr, data in attrs:
91 if atr.lower() == 'data-cite':
92 self.citetag = tag
93 self.opentags = 1
94 self.citelist.append([data, self.get_offset()])
95 return
96
97 if tag == self.citetag:
98 # found an open citation tag but not the starting one
99 self.opentags += 1
100
101 def handle_endtag(self, tag):
102 if tag == self.citetag:
103 # found a citation tag; check whether it closes the starting one
104 if self.opentags == 1:
105 pos = self.get_offset()
106 self.citelist[-1].append(pos+len(tag)+3)
107 self.opentags -= 1
108
109 def feed(self, data):
110 self.data = data
111 HTMLParser.feed(self, data)
@@ -9,15 +9,13 b''
9 9 #-----------------------------------------------------------------------------
10 10 # Imports
11 11 #-----------------------------------------------------------------------------
12
13 12 from ..citation import citation2latex
14 13 from nose.tools import assert_equal
15 14
16 15 #-----------------------------------------------------------------------------
17 16 # Tests
18 17 #-----------------------------------------------------------------------------
19
20 test_md = """
18 test_md = {"""
21 19 # My Heading
22 20
23 21 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -26,14 +24,13 b' velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fper'
26 24 Lorem ipsum dolor sit amet, consectetur adipiscing elit
27 25 <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra.
28 26
29 <p>Here is a plain paragraph that should be unaffected.</p>
27 <p>Here is a plain paragraph that should be unaffected. It contains simple
28 relations like 1<2 & 4>5.</p>
30 29
31 30 * One <cite data-cite="jdfreder">Jonathan</cite>.
32 31 * Two <cite data-cite="carreau">Matthias</cite>.
33 32 * Three <cite data-cite="ivanov">Paul</cite>.
34 """
35
36 test_md_parsed = """
33 """: """
37 34 # My Heading
38 35
39 36 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -42,18 +39,112 b' velit, lobortis sed interdum at, vestibulum vitae libero \\cite{fperez}.'
42 39 Lorem ipsum dolor sit amet, consectetur adipiscing elit
43 40 \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra.
44 41
45 <p>Here is a plain paragraph that should be unaffected.</p>
42 <p>Here is a plain paragraph that should be unaffected. It contains simple
43 relations like 1<2 & 4>5.</p>
46 44
47 45 * One \cite{jdfreder}.
48 46 * Two \cite{carreau}.
49 47 * Three \cite{ivanov}.
50 """
48 """,
49
50 # No citations
51 r"""The quick brown fox jumps over the lazy dog.""":
52 r"""The quick brown fox jumps over the lazy dog.""",
53
54 # Simple inline
55 r"""Foo <cite data-cite=asdf>Text</cite> bar""":
56 r"""Foo \cite{asdf} bar""",
57
58 # Multiline
59 r"""<cite data-cite=ewqr>Text
60 </cite>Foo""":
61 r"""\cite{ewqr}Foo""",
62
63 # Nested tags
64 r"""<div><div data-cite=Foo><div>Text</div></div></div> Bar""":
65 r"""<div>\cite{Foo}</div> Bar""",
66
67 # Including Maths
68 r"""Foo $3*2*1$ <div data-cite=Foo>Text</div> Bar""":
69 r"""Foo $3*2*1$ \cite{Foo} Bar""",
70
71 # Missing end tag
72 r"""<cite data-cite=asdf>Test Foo""":
73 r"""\cite{asdf}""",
74
75 r"""<cite data-cite=asdf><cite>Test Foo""":
76 r"""\cite{asdf}""",
77
78 r"""<cite data-cite=asdf><cite>Test</cite> Foo""":
79 r"""\cite{asdf}""",
80
81 # Multiple arguments
82 r"""<cite width=qwer data-cite=asdf>Test</cite> Foo""":
83 r"""\cite{asdf} Foo""",
84
85 # Wrong capitalization
86 r"""<CITE data-cite=asdf>Test</cite> Foo""":
87 r"""\cite{asdf} Foo""",
88
89 r"""<cite DATA-CITE=asdf>Test</cite> Foo""":
90 r"""\cite{asdf} Foo""",
91
92 # Wrong end tag
93 r"""<asd data-cite=wer> ksjfs </asdf> sdf ds """:
94 r"""\cite{wer}""",
95
96 r"""<asd data-cite=wer>""":
97 r"""\cite{wer}""",
98
99 # Invalid tag names
100 r"""<frog> <foo data-cite=wer></foo>""":
101 r"""<frog> \cite{wer}""",
102
103 # Non-nested tags
104 r"""<strong> <h1> <cite data-cite=asdf></cite>Test</strong> Foo </h1>""":
105 r"""<strong> <h1> \cite{asdf}Test</strong> Foo </h1>""",
106
107 # LXML errors
108 r"""Foo
109 \begin{eqnarray}
110 1 & <cite data-cite=bar>bar1</cite> \\
111 3 & 4 \\
112 \end{eqnarray}""":
113 r"""Foo
114 \begin{eqnarray}
115 1 & \cite{bar} \\
116 3 & 4 \\
117 \end{eqnarray}""",
118
119 r"""
120 1<2 is true, but 3>4 is false.
121
122 $1<2$ is true, but $3>4$ is false.
123
124 1<2 it is even worse if it is alone in a line.""":
125 r"""
126 1<2 is true, but 3>4 is false.
127
128 $1<2$ is true, but $3>4$ is false.
129
130 1<2 it is even worse if it is alone in a line.""",
131
132 r"""
133 1 < 2 is true, but 3 > 4 is false
134
135 $1 < 2$ is true, but $3 > 4$ is false
136
137 1 < 2 it is even worse if it is alone in a line.
138 """:
139 r"""
140 1 < 2 is true, but 3 > 4 is false
141
142 $1 < 2$ is true, but $3 > 4$ is false
143
144 1 < 2 it is even worse if it is alone in a line.
145 """}
51 146
52 147 def test_citation2latex():
53 148 """Are citations parsed properly?"""
54 try:
55 from lxml import html #analysis:ignore
56 except ImportError:
57 assert_equal(test_md, citation2latex(test_md))
58 else:
59 assert_equal(test_md_parsed, citation2latex(test_md))
149 for input, output in test_md.items():
150 yield (assert_equal, citation2latex(input), output)
General Comments 0
You need to be logged in to leave comments. Login now