Show More
@@ -1,72 +1,111 b'' | |||||
1 | """Citation handling for LaTeX output.""" |
|
1 | """Citation handling for LaTeX output.""" | |
2 |
|
2 | |||
3 | #----------------------------------------------------------------------------- |
|
3 | #----------------------------------------------------------------------------- | |
4 | # Copyright (c) 2013, the IPython Development Team. |
|
4 | # Copyright (c) 2013, the IPython Development Team. | |
5 | # |
|
5 | # | |
6 | # Distributed under the terms of the Modified BSD License. |
|
6 | # Distributed under the terms of the Modified BSD License. | |
7 | # |
|
7 | # | |
8 | # The full license is in the file COPYING.txt, distributed with this software. |
|
8 | # The full license is in the file COPYING.txt, distributed with this software. | |
9 | #----------------------------------------------------------------------------- |
|
9 | #----------------------------------------------------------------------------- | |
10 |
|
10 | |||
11 | #----------------------------------------------------------------------------- |
|
11 | #----------------------------------------------------------------------------- | |
12 | # Code |
|
12 | # Imports | |
13 | #----------------------------------------------------------------------------- |
|
13 | #----------------------------------------------------------------------------- | |
|
14 | from IPython.utils.py3compat import PY3 | |||
|
15 | if PY3: | |||
|
16 | from html.parser import HTMLParser | |||
|
17 | else: | |||
|
18 | from HTMLParser import HTMLParser | |||
14 |
|
19 | |||
|
20 | #----------------------------------------------------------------------------- | |||
|
21 | # Functions | |||
|
22 | #----------------------------------------------------------------------------- | |||
15 |
|
23 | |||
16 | __all__ = ['citation2latex'] |
|
24 | __all__ = ['citation2latex'] | |
17 |
|
25 | |||
18 |
|
26 | |||
def citation2latex(s):
    """Parse citations in Markdown cells.

    This looks for HTML tags having a data attribute named `data-cite`
    and replaces each of them with a call to the LaTeX cite command. The
    transformation looks like this:

    `<cite data-cite="granger">(Granger, 2013)</cite>`

    Becomes

    `\\cite{granger}`

    Any HTML tag can be used, which allows the citations to be formatted
    in HTML in any manner.
    """
    parser = CitationParser()
    parser.feed(s)
    parser.close()

    # Each citelist entry is [key, start] or [key, start, end]; the end
    # offset is missing when the citing element was never closed.
    pieces = []
    cursor = 0
    for citation in parser.citelist:
        # Copy the text preceding the citing element verbatim, then emit
        # the LaTeX command in place of the whole element.
        pieces.append(s[cursor:citation[1]])
        pieces.append('\\cite{%s}' % citation[0])
        # -1 marks "no known end": the remainder of the text is dropped.
        cursor = citation[2] if len(citation) == 3 else -1
    if cursor != -1:
        pieces.append(s[cursor:])
    return u''.join(pieces)
|
|||
48 |
|
54 | |||
|
55 | #----------------------------------------------------------------------------- | |||
|
56 | # Classes | |||
|
57 | #----------------------------------------------------------------------------- | |||
|
class CitationParser(HTMLParser):
    """Locate HTML elements that carry a ``data-cite`` attribute.

    The parser does not rewrite anything itself. While the text is fed
    through it, it records in ``citelist`` one entry per citing element:
    ``[key, start]`` when the element is opened and, once the matching end
    tag is seen, ``[key, start, end]``. ``start`` and ``end`` are character
    offsets into the fed text, so ``text[start:end]`` is the whole element
    that should be replaced by ``\\cite{key}``.

    Elements nested inside a citing element are ignored; only same-named
    tags are counted so that the correct closing tag is matched.

    Inherits from HTMLParser, overrides:
     - handle_starttag
     - handle_endtag
     - feed

    Offsets are computed against the text of the most recent ``feed`` call,
    so the whole document is expected to be passed in a single call.
    """
    # nesting depth of tags named like the active citation tag
    # (0 means no citation is currently open)
    opentags = None
    # list of found citations: [key, start] or [key, start, end]
    citelist = None
    # tag name of the currently open citing element, None when idle
    citetag = None
    # text handed to the last feed() call, used to turn (line, column)
    # positions into absolute offsets
    data = None

    def __init__(self):
        self.citelist = []
        self.opentags = 0
        # Explicit base-class call: on Python 2 HTMLParser is an old-style
        # class, so super() cannot be used.
        HTMLParser.__init__(self)

    def get_offset(self):
        """Return the absolute offset in ``self.data`` of the current tag.

        ``getpos()`` reports a 1-based line number and a column; the line
        starts are located by walking over the preceding newlines.
        """
        lin, offset = self.getpos()
        pos = 0
        for _ in range(lin - 1):
            pos = self.data.find('\n', pos) + 1
        return pos + offset

    def handle_starttag(self, tag, attrs):
        """Open a citation on ``data-cite``, or track nesting of its tag."""
        if self.opentags == 0:
            # No citation active: only a tag with data-cite is of interest.
            for atr, data in attrs:
                if atr.lower() == 'data-cite':
                    self.citetag = tag
                    self.opentags = 1
                    self.citelist.append([data, self.get_offset()])
                    break
            # Never fall through to the nesting counter here: outside a
            # citation there is nothing to nest into, and counting would
            # corrupt the entry of an already finished citation.
            return

        if tag == self.citetag:
            # same tag name opened again inside the active citation
            self.opentags += 1

    def handle_endtag(self, tag):
        """Close the active citation when its matching end tag is reached."""
        if self.opentags == 0 or tag != self.citetag:
            # Either no citation is open (stray end tag) or the tag is
            # unrelated; the depth counter must not go negative.
            return

        if self.opentags == 1:
            pos = self.get_offset()
            # Use the real position of the closing '>' so that variants
            # such as '</cite >' or self-closing tags are covered too.
            close = self.data.find('>', pos)
            if close == -1:
                end = pos + len(tag) + 3
            else:
                end = close + 1
            self.citelist[-1].append(end)
            # Citation finished: forget the tag name until the next one.
            self.citetag = None
        self.opentags -= 1

    def feed(self, data):
        """Remember ``data`` for offset computation, then parse it."""
        self.data = data
        HTMLParser.feed(self, data)
@@ -1,59 +1,150 b'' | |||||
1 | #----------------------------------------------------------------------------- |
|
1 | #----------------------------------------------------------------------------- | |
2 | # Copyright (c) 2013, the IPython Development Team. |
|
2 | # Copyright (c) 2013, the IPython Development Team. | |
3 | # |
|
3 | # | |
4 | # Distributed under the terms of the Modified BSD License. |
|
4 | # Distributed under the terms of the Modified BSD License. | |
5 | # |
|
5 | # | |
6 | # The full license is in the file COPYING.txt, distributed with this software. |
|
6 | # The full license is in the file COPYING.txt, distributed with this software. | |
7 | #----------------------------------------------------------------------------- |
|
7 | #----------------------------------------------------------------------------- | |
8 |
|
8 | |||
9 | #----------------------------------------------------------------------------- |
|
9 | #----------------------------------------------------------------------------- | |
10 | # Imports |
|
10 | # Imports | |
11 | #----------------------------------------------------------------------------- |
|
11 | #----------------------------------------------------------------------------- | |
12 |
|
||||
13 | from ..citation import citation2latex |
|
12 | from ..citation import citation2latex | |
14 | from nose.tools import assert_equal |
|
13 | from nose.tools import assert_equal | |
15 |
|
14 | |||
16 | #----------------------------------------------------------------------------- |
|
15 | #----------------------------------------------------------------------------- | |
17 | # Tests |
|
16 | # Tests | |
18 | #----------------------------------------------------------------------------- |
|
17 | #----------------------------------------------------------------------------- | |
19 |
|
18 | test_md = {""" | ||
20 | test_md = """ |
|
|||
21 | # My Heading |
|
19 | # My Heading | |
22 |
|
20 | |||
23 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue |
|
21 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue | |
24 | porttitor scelerisque ac id diam <cite data-cite="granger">Granger</cite>. Mauris elit |
|
22 | porttitor scelerisque ac id diam <cite data-cite="granger">Granger</cite>. Mauris elit | |
25 | velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fperez">Perez</strong>. |
|
23 | velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fperez">Perez</strong>. | |
26 | Lorem ipsum dolor sit amet, consectetur adipiscing elit |
|
24 | Lorem ipsum dolor sit amet, consectetur adipiscing elit | |
27 | <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra. |
|
25 | <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra. | |
28 |
|
26 | |||
29 |
<p>Here is a plain paragraph that should be unaffected. |
|
27 | <p>Here is a plain paragraph that should be unaffected. It contains simple | |
|
28 | relations like 1<2 & 4>5.</p> | |||
30 |
|
29 | |||
31 | * One <cite data-cite="jdfreder">Jonathan</cite>. |
|
30 | * One <cite data-cite="jdfreder">Jonathan</cite>. | |
32 | * Two <cite data-cite="carreau">Matthias</cite>. |
|
31 | * Two <cite data-cite="carreau">Matthias</cite>. | |
33 | * Three <cite data-cite="ivanov">Paul</cite>. |
|
32 | * Three <cite data-cite="ivanov">Paul</cite>. | |
34 | """ |
|
33 | """: """ | |
35 |
|
||||
36 | test_md_parsed = """ |
|
|||
37 | # My Heading |
|
34 | # My Heading | |
38 |
|
35 | |||
39 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue |
|
36 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue | |
40 | porttitor scelerisque ac id diam \cite{granger}. Mauris elit |
|
37 | porttitor scelerisque ac id diam \cite{granger}. Mauris elit | |
41 | velit, lobortis sed interdum at, vestibulum vitae libero \cite{fperez}. |
|
38 | velit, lobortis sed interdum at, vestibulum vitae libero \cite{fperez}. | |
42 | Lorem ipsum dolor sit amet, consectetur adipiscing elit |
|
39 | Lorem ipsum dolor sit amet, consectetur adipiscing elit | |
43 | \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra. |
|
40 | \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra. | |
44 |
|
41 | |||
45 |
<p>Here is a plain paragraph that should be unaffected. |
|
42 | <p>Here is a plain paragraph that should be unaffected. It contains simple | |
|
43 | relations like 1<2 & 4>5.</p> | |||
46 |
|
44 | |||
47 | * One \cite{jdfreder}. |
|
45 | * One \cite{jdfreder}. | |
48 | * Two \cite{carreau}. |
|
46 | * Two \cite{carreau}. | |
49 | * Three \cite{ivanov}. |
|
47 | * Three \cite{ivanov}. | |
50 | """ |
|
48 | """, | |
|
49 | ||||
|
50 | # No citations | |||
|
51 | r"""The quick brown fox jumps over the lazy dog.""": | |||
|
52 | r"""The quick brown fox jumps over the lazy dog.""", | |||
|
53 | ||||
|
54 | # Simple inline | |||
|
55 | r"""Foo <cite data-cite=asdf>Text</cite> bar""": | |||
|
56 | r"""Foo \cite{asdf} bar""", | |||
|
57 | ||||
|
58 | # Multiline | |||
|
59 | r"""<cite data-cite=ewqr>Text | |||
|
60 | </cite>Foo""": | |||
|
61 | r"""\cite{ewqr}Foo""", | |||
|
62 | ||||
|
63 | # Nested tags | |||
|
64 | r"""<div><div data-cite=Foo><div>Text</div></div></div> Bar""": | |||
|
65 | r"""<div>\cite{Foo}</div> Bar""", | |||
|
66 | ||||
|
67 | # Including Maths | |||
|
68 | r"""Foo $3*2*1$ <div data-cite=Foo>Text</div> Bar""": | |||
|
69 | r"""Foo $3*2*1$ \cite{Foo} Bar""", | |||
|
70 | ||||
|
71 | # Missing end tag | |||
|
72 | r"""<cite data-cite=asdf>Test Foo""": | |||
|
73 | r"""\cite{asdf}""", | |||
|
74 | ||||
|
75 | r"""<cite data-cite=asdf><cite>Test Foo""": | |||
|
76 | r"""\cite{asdf}""", | |||
|
77 | ||||
|
78 | r"""<cite data-cite=asdf><cite>Test</cite> Foo""": | |||
|
79 | r"""\cite{asdf}""", | |||
|
80 | ||||
|
81 | # Multiple arguments | |||
|
82 | r"""<cite width=qwer data-cite=asdf>Test</cite> Foo""": | |||
|
83 | r"""\cite{asdf} Foo""", | |||
|
84 | ||||
|
85 | # Wrong capitalization | |||
|
86 | r"""<CITE data-cite=asdf>Test</cite> Foo""": | |||
|
87 | r"""\cite{asdf} Foo""", | |||
|
88 | ||||
|
89 | r"""<cite DATA-CITE=asdf>Test</cite> Foo""": | |||
|
90 | r"""\cite{asdf} Foo""", | |||
|
91 | ||||
|
92 | # Wrong end tag | |||
|
93 | r"""<asd data-cite=wer> ksjfs </asdf> sdf ds """: | |||
|
94 | r"""\cite{wer}""", | |||
|
95 | ||||
|
96 | r"""<asd data-cite=wer>""": | |||
|
97 | r"""\cite{wer}""", | |||
|
98 | ||||
|
99 | # Invalid tag names | |||
|
100 | r"""<frog> <foo data-cite=wer></foo>""": | |||
|
101 | r"""<frog> \cite{wer}""", | |||
|
102 | ||||
|
103 | # Non-nested tags | |||
|
104 | r"""<strong> <h1> <cite data-cite=asdf></cite>Test</strong> Foo </h1>""": | |||
|
105 | r"""<strong> <h1> \cite{asdf}Test</strong> Foo </h1>""", | |||
|
106 | ||||
|
107 | # LXML errors | |||
|
108 | r"""Foo | |||
|
109 | \begin{eqnarray} | |||
|
110 | 1 & <cite data-cite=bar>bar1</cite> \\ | |||
|
111 | 3 & 4 \\ | |||
|
112 | \end{eqnarray}""": | |||
|
113 | r"""Foo | |||
|
114 | \begin{eqnarray} | |||
|
115 | 1 & \cite{bar} \\ | |||
|
116 | 3 & 4 \\ | |||
|
117 | \end{eqnarray}""", | |||
|
118 | ||||
|
119 | r""" | |||
|
120 | 1<2 is true, but 3>4 is false. | |||
|
121 | ||||
|
122 | $1<2$ is true, but $3>4$ is false. | |||
|
123 | ||||
|
124 | 1<2 it is even worse if it is alone in a line.""": | |||
|
125 | r""" | |||
|
126 | 1<2 is true, but 3>4 is false. | |||
|
127 | ||||
|
128 | $1<2$ is true, but $3>4$ is false. | |||
|
129 | ||||
|
130 | 1<2 it is even worse if it is alone in a line.""", | |||
|
131 | ||||
|
132 | r""" | |||
|
133 | 1 < 2 is true, but 3 > 4 is false | |||
|
134 | ||||
|
135 | $1 < 2$ is true, but $3 > 4$ is false | |||
|
136 | ||||
|
137 | 1 < 2 it is even worse if it is alone in a line. | |||
|
138 | """: | |||
|
139 | r""" | |||
|
140 | 1 < 2 is true, but 3 > 4 is false | |||
|
141 | ||||
|
142 | $1 < 2$ is true, but $3 > 4$ is false | |||
|
143 | ||||
|
144 | 1 < 2 it is even worse if it is alone in a line. | |||
|
145 | """} | |||
51 |
|
146 | |||
def test_citation2latex():
    """Check every Markdown sample in test_md against its expected LaTeX."""
    # Generator-style test: one (callable, actual, expected) case is
    # yielded per entry of the test_md mapping.
    for markdown, expected in test_md.items():
        yield (assert_equal, citation2latex(markdown), expected)
|
General Comments 0
You need to be logged in to leave comments.
Login now