Show More
@@ -9,9 +9,17 b'' | |||
|
9 | 9 | #----------------------------------------------------------------------------- |
|
10 | 10 | |
|
11 | 11 | #----------------------------------------------------------------------------- |
|
12 | # Code | |
|
12 | # Imports | |
|
13 | 13 | #----------------------------------------------------------------------------- |
|
14 | from IPython.utils.py3compat import PY3 | |
|
15 | if PY3: | |
|
16 | from html.parser import HTMLParser | |
|
17 | else: | |
|
18 | from HTMLParser import HTMLParser | |
|
14 | 19 | |
|
20 | #----------------------------------------------------------------------------- | |
|
21 | # Functions | |
|
22 | #----------------------------------------------------------------------------- | |
|
15 | 23 | |
|
16 | 24 | __all__ = ['citation2latex'] |
|
17 | 25 | |
@@ -32,41 +40,72 b' def citation2latex(s):' | |||
|
32 | 40 | Any HTML tag can be used, which allows the citations to be formatted |
|
33 | 41 | in HTML in any manner. |
|
34 | 42 | """ |
|
35 | try: | |
|
36 | from lxml import html | |
|
37 | except ImportError: | |
|
38 | return s | |
|
39 | ||
|
40 | tree = html.fragment_fromstring(s, create_parent='div') | |
|
41 | _process_node_cite(tree) | |
|
42 | s = html.tostring(tree, encoding='unicode') | |
|
43 | if s.endswith('</div>'): | |
|
44 | s = s[:-6] | |
|
45 | if s.startswith('<div>'): | |
|
46 | s = s[5:] | |
|
47 | return s | |
|
48 | ||
|
49 | ||
|
50 | def _process_node_cite(node): | |
|
51 | """Do the citation replacement as we walk the lxml tree.""" | |
|
52 | ||
|
53 | def _get(o, name): | |
|
54 | value = getattr(o, name, None) | |
|
55 | return '' if value is None else value | |
|
56 | ||
|
57 | if 'data-cite' in node.attrib: | |
|
58 | cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']} | |
|
59 | prev = node.getprevious() | |
|
60 | if prev is not None: | |
|
61 | prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail') | |
|
62 | else: | |
|
63 | parent = node.getparent() | |
|
64 | if parent is not None: | |
|
65 | parent.text = _get(parent, 'text') + cite + _get(node, 'tail') | |
|
66 | try: | |
|
67 | node.getparent().remove(node) | |
|
68 | except AttributeError: | |
|
69 | pass | |
|
70 |
|
|
|
71 | for child in node: | |
|
72 | _process_node_cite(child) | |
|
43 | parser = CitationParser() | |
|
44 | parser.feed(s) | |
|
45 | parser.close() | |
|
46 | outtext = u'' | |
|
47 | startpos = 0 | |
|
48 | for citation in parser.citelist: | |
|
49 | outtext += s[startpos:citation[1]] | |
|
50 | outtext += '\\cite{%s}'%citation[0] | |
|
51 | startpos = citation[2] if len(citation)==3 else -1 | |
|
52 | outtext += s[startpos:] if startpos != -1 else '' | |
|
53 | return outtext | |
|
54 | ||
|
55 | #----------------------------------------------------------------------------- | |
|
56 | # Classes | |
|
57 | #----------------------------------------------------------------------------- | |
|
58 | class CitationParser(HTMLParser): | |
|
59 | """Citation Parser | |
|
60 | ||
|
61 | Replaces html tags with data-cite attribute with respective latex \\cite. | |
|
62 | ||
|
63 | Inherits from HTMLParser, overrides: | |
|
64 | - handle_starttag | |
|
65 | - handle_endtag | |
|
66 | """ | |
|
67 | # number of open tags | |
|
68 | opentags = None | |
|
69 | # list of found citations | |
|
70 | citelist = None | |
|
71 | # active citation tag | |
|
72 | citetag = None | |
|
73 | ||
|
74 | def __init__(self): | |
|
75 | self.citelist = [] | |
|
76 | self.opentags = 0 | |
|
77 | HTMLParser.__init__(self) | |
|
78 | ||
|
79 | def get_offset(self): | |
|
80 | # Compute start position in source | |
|
81 | lin, offset = self.getpos() | |
|
82 | pos = 0 | |
|
83 | for i in range(lin-1): | |
|
84 | pos = self.data.find('\n',pos) + 1 | |
|
85 | return pos + offset | |
|
86 | ||
|
87 | def handle_starttag(self, tag, attrs): | |
|
88 | # for each tag check if attributes are present and if no citation is active | |
|
89 | if self.opentags == 0 and len(attrs)>0: | |
|
90 | for atr, data in attrs: | |
|
91 | if atr.lower() == 'data-cite': | |
|
92 | self.citetag = tag | |
|
93 | self.opentags = 1 | |
|
94 | self.citelist.append([data, self.get_offset()]) | |
|
95 | return | |
|
96 | ||
|
97 | if tag == self.citetag: | |
|
98 | # found an open citation tag but not the starting one | |
|
99 | self.opentags += 1 | |
|
100 | ||
|
101 | def handle_endtag(self, tag): | |
|
102 | if tag == self.citetag: | |
|
103 | # found citation tag check if starting one | |
|
104 | if self.opentags == 1: | |
|
105 | pos = self.get_offset() | |
|
106 | self.citelist[-1].append(pos+len(tag)+3) | |
|
107 | self.opentags -= 1 | |
|
108 | ||
|
109 | def feed(self, data): | |
|
110 | self.data = data | |
|
111 | HTMLParser.feed(self, data) |
@@ -9,15 +9,13 b'' | |||
|
9 | 9 | #----------------------------------------------------------------------------- |
|
10 | 10 | # Imports |
|
11 | 11 | #----------------------------------------------------------------------------- |
|
12 | ||
|
13 | 12 | from ..citation import citation2latex |
|
14 | 13 | from nose.tools import assert_equal |
|
15 | 14 | |
|
16 | 15 | #----------------------------------------------------------------------------- |
|
17 | 16 | # Tests |
|
18 | 17 | #----------------------------------------------------------------------------- |
|
19 | ||
|
20 | test_md = """ | |
|
18 | test_md = {""" | |
|
21 | 19 | # My Heading |
|
22 | 20 | |
|
23 | 21 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue |
@@ -26,14 +24,13 b' velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fper' | |||
|
26 | 24 | Lorem ipsum dolor sit amet, consectetur adipiscing elit |
|
27 | 25 | <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra. |
|
28 | 26 | |
|
29 |
<p>Here is a plain paragraph that should be unaffected. |
|
|
27 | <p>Here is a plain paragraph that should be unaffected. It contains simple | |
|
28 | relations like 1<2 & 4>5.</p> | |
|
30 | 29 | |
|
31 | 30 | * One <cite data-cite="jdfreder">Jonathan</cite>. |
|
32 | 31 | * Two <cite data-cite="carreau">Matthias</cite>. |
|
33 | 32 | * Three <cite data-cite="ivanov">Paul</cite>. |
|
34 | """ | |
|
35 | ||
|
36 | test_md_parsed = """ | |
|
33 | """: """ | |
|
37 | 34 | # My Heading |
|
38 | 35 | |
|
39 | 36 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue |
@@ -42,18 +39,112 b' velit, lobortis sed interdum at, vestibulum vitae libero \\cite{fperez}.' | |||
|
42 | 39 | Lorem ipsum dolor sit amet, consectetur adipiscing elit |
|
43 | 40 | \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra. |
|
44 | 41 | |
|
45 |
<p>Here is a plain paragraph that should be unaffected. |
|
|
42 | <p>Here is a plain paragraph that should be unaffected. It contains simple | |
|
43 | relations like 1<2 & 4>5.</p> | |
|
46 | 44 | |
|
47 | 45 | * One \cite{jdfreder}. |
|
48 | 46 | * Two \cite{carreau}. |
|
49 | 47 | * Three \cite{ivanov}. |
|
50 | """ | |
|
48 | """, | |
|
49 | ||
|
50 | # No citations | |
|
51 | r"""The quick brown fox jumps over the lazy dog.""": | |
|
52 | r"""The quick brown fox jumps over the lazy dog.""", | |
|
53 | ||
|
54 | # Simple inline | |
|
55 | r"""Foo <cite data-cite=asdf>Text</cite> bar""": | |
|
56 | r"""Foo \cite{asdf} bar""", | |
|
57 | ||
|
58 | # Multiline | |
|
59 | r"""<cite data-cite=ewqr>Text | |
|
60 | </cite>Foo""": | |
|
61 | r"""\cite{ewqr}Foo""", | |
|
62 | ||
|
63 | # Nested tags | |
|
64 | r"""<div><div data-cite=Foo><div>Text</div></div></div> Bar""": | |
|
65 | r"""<div>\cite{Foo}</div> Bar""", | |
|
66 | ||
|
67 | # Including Maths | |
|
68 | r"""Foo $3*2*1$ <div data-cite=Foo>Text</div> Bar""": | |
|
69 | r"""Foo $3*2*1$ \cite{Foo} Bar""", | |
|
70 | ||
|
71 | # Missing end tag | |
|
72 | r"""<cite data-cite=asdf>Test Foo""": | |
|
73 | r"""\cite{asdf}""", | |
|
74 | ||
|
75 | r"""<cite data-cite=asdf><cite>Test Foo""": | |
|
76 | r"""\cite{asdf}""", | |
|
77 | ||
|
78 | r"""<cite data-cite=asdf><cite>Test</cite> Foo""": | |
|
79 | r"""\cite{asdf}""", | |
|
80 | ||
|
81 | # Multiple arguments | |
|
82 | r"""<cite width=qwer data-cite=asdf>Test</cite> Foo""": | |
|
83 | r"""\cite{asdf} Foo""", | |
|
84 | ||
|
85 | # Wrong capitalization | |
|
86 | r"""<CITE data-cite=asdf>Test</cite> Foo""": | |
|
87 | r"""\cite{asdf} Foo""", | |
|
88 | ||
|
89 | r"""<cite DATA-CITE=asdf>Test</cite> Foo""": | |
|
90 | r"""\cite{asdf} Foo""", | |
|
91 | ||
|
92 | # Wrong end tag | |
|
93 | r"""<asd data-cite=wer> ksjfs </asdf> sdf ds """: | |
|
94 | r"""\cite{wer}""", | |
|
95 | ||
|
96 | r"""<asd data-cite=wer>""": | |
|
97 | r"""\cite{wer}""", | |
|
98 | ||
|
99 | # Invalid tag names | |
|
100 | r"""<frog> <foo data-cite=wer></foo>""": | |
|
101 | r"""<frog> \cite{wer}""", | |
|
102 | ||
|
103 | # Non-nested tags | |
|
104 | r"""<strong> <h1> <cite data-cite=asdf></cite>Test</strong> Foo </h1>""": | |
|
105 | r"""<strong> <h1> \cite{asdf}Test</strong> Foo </h1>""", | |
|
106 | ||
|
107 | # LXML errors | |
|
108 | r"""Foo | |
|
109 | \begin{eqnarray} | |
|
110 | 1 & <cite data-cite=bar>bar1</cite> \\ | |
|
111 | 3 & 4 \\ | |
|
112 | \end{eqnarray}""": | |
|
113 | r"""Foo | |
|
114 | \begin{eqnarray} | |
|
115 | 1 & \cite{bar} \\ | |
|
116 | 3 & 4 \\ | |
|
117 | \end{eqnarray}""", | |
|
118 | ||
|
119 | r""" | |
|
120 | 1<2 is true, but 3>4 is false. | |
|
121 | ||
|
122 | $1<2$ is true, but $3>4$ is false. | |
|
123 | ||
|
124 | 1<2 it is even worse if it is alone in a line.""": | |
|
125 | r""" | |
|
126 | 1<2 is true, but 3>4 is false. | |
|
127 | ||
|
128 | $1<2$ is true, but $3>4$ is false. | |
|
129 | ||
|
130 | 1<2 it is even worse if it is alone in a line.""", | |
|
131 | ||
|
132 | r""" | |
|
133 | 1 < 2 is true, but 3 > 4 is false | |
|
134 | ||
|
135 | $1 < 2$ is true, but $3 > 4$ is false | |
|
136 | ||
|
137 | 1 < 2 it is even worse if it is alone in a line. | |
|
138 | """: | |
|
139 | r""" | |
|
140 | 1 < 2 is true, but 3 > 4 is false | |
|
141 | ||
|
142 | $1 < 2$ is true, but $3 > 4$ is false | |
|
143 | ||
|
144 | 1 < 2 it is even worse if it is alone in a line. | |
|
145 | """} | |
|
51 | 146 | |
|
52 | 147 | def test_citation2latex(): |
|
53 | 148 | """Are citations parsed properly?""" |
|
54 | try: | |
|
55 | from lxml import html #analysis:ignore | |
|
56 | except ImportError: | |
|
57 | assert_equal(test_md, citation2latex(test_md)) | |
|
58 | else: | |
|
59 | assert_equal(test_md_parsed, citation2latex(test_md)) | |
|
149 | for input, output in test_md.items(): | |
|
150 | yield (assert_equal, citation2latex(input), output) |
General Comments 0
You need to be logged in to leave comments.
Login now