##// END OF EJS Templates
Merge pull request #5025 from jdfreder/jakob-html...
Brian E. Granger -
r15061:017de3a7 merge
parent child Browse files
Show More
@@ -9,9 +9,17 b''
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10
10
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12 # Code
12 # Imports
13 #-----------------------------------------------------------------------------
13 #-----------------------------------------------------------------------------
14 from IPython.utils.py3compat import PY3
15 if PY3:
16 from html.parser import HTMLParser
17 else:
18 from HTMLParser import HTMLParser
14
19
20 #-----------------------------------------------------------------------------
21 # Functions
22 #-----------------------------------------------------------------------------
15
23
16 __all__ = ['citation2latex']
24 __all__ = ['citation2latex']
17
25
@@ -32,41 +40,72 b' def citation2latex(s):'
32 Any HTML tag can be used, which allows the citations to be formatted
40 Any HTML tag can be used, which allows the citations to be formatted
33 in HTML in any manner.
41 in HTML in any manner.
34 """
42 """
35 try:
43 parser = CitationParser()
36 from lxml import html
44 parser.feed(s)
37 except ImportError:
45 parser.close()
38 return s
46 outtext = u''
39
47 startpos = 0
40 tree = html.fragment_fromstring(s, create_parent='div')
48 for citation in parser.citelist:
41 _process_node_cite(tree)
49 outtext += s[startpos:citation[1]]
42 s = html.tostring(tree, encoding='unicode')
50 outtext += '\\cite{%s}'%citation[0]
43 if s.endswith('</div>'):
51 startpos = citation[2] if len(citation)==3 else -1
44 s = s[:-6]
52 outtext += s[startpos:] if startpos != -1 else ''
45 if s.startswith('<div>'):
53 return outtext
46 s = s[5:]
47 return s
48
54
55 #-----------------------------------------------------------------------------
56 # Classes
57 #-----------------------------------------------------------------------------
58 class CitationParser(HTMLParser):
59 """Citation Parser
49
60
50 def _process_node_cite(node):
61 Replaces html tags with data-cite attribute with respective latex \\cite.
51 """Do the citation replacement as we walk the lxml tree."""
52
62
53 def _get(o, name):
63 Inherits from HTMLParser, overrides:
54 value = getattr(o, name, None)
64 - handle_starttag
55 return '' if value is None else value
65 - handle_endtag
66 """
67 # number of open tags
68 opentags = None
69 # list of found citations
70 citelist = None
71 # active citation tag
72 citetag = None
73
74 def __init__(self):
75 self.citelist = []
76 self.opentags = 0
77 HTMLParser.__init__(self)
56
78
57 if 'data-cite' in node.attrib:
79 def get_offset(self):
58 cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
80 # Compute start position in source
59 prev = node.getprevious()
81 lin, offset = self.getpos()
60 if prev is not None:
82 pos = 0
61 prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')
83 for i in range(lin-1):
62 else:
84 pos = self.data.find('\n',pos) + 1
63 parent = node.getparent()
85 return pos + offset
64 if parent is not None:
86
65 parent.text = _get(parent, 'text') + cite + _get(node, 'tail')
87 def handle_starttag(self, tag, attrs):
66 try:
88 # for each tag check if attributes are present and if no citation is active
67 node.getparent().remove(node)
89 if self.opentags == 0 and len(attrs)>0:
68 except AttributeError:
90 for atr, data in attrs:
69 pass
91 if atr.lower() == 'data-cite':
70 else:
92 self.citetag = tag
71 for child in node:
93 self.opentags = 1
72 _process_node_cite(child)
94 self.citelist.append([data, self.get_offset()])
95 return
96
97 if tag == self.citetag:
98 # found an open citation tag but not the starting one
99 self.opentags += 1
100
101 def handle_endtag(self, tag):
102 if tag == self.citetag:
103 # found citation tag; check if it is the starting one
104 if self.opentags == 1:
105 pos = self.get_offset()
106 self.citelist[-1].append(pos+len(tag)+3)
107 self.opentags -= 1
108
109 def feed(self, data):
110 self.data = data
111 HTMLParser.feed(self, data)
@@ -9,15 +9,13 b''
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10 # Imports
10 # Imports
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12
13 from ..citation import citation2latex
12 from ..citation import citation2latex
14 from nose.tools import assert_equal
13 from nose.tools import assert_equal
15
14
16 #-----------------------------------------------------------------------------
15 #-----------------------------------------------------------------------------
17 # Tests
16 # Tests
18 #-----------------------------------------------------------------------------
17 #-----------------------------------------------------------------------------
19
18 test_md = {"""
20 test_md = """
21 # My Heading
19 # My Heading
22
20
23 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
21 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -26,14 +24,13 b' velit, lobortis sed interdum at, vestibulum vitae libero <strong data-cite="fper'
26 Lorem ipsum dolor sit amet, consectetur adipiscing elit
24 Lorem ipsum dolor sit amet, consectetur adipiscing elit
27 <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra.
25 <em data-cite="takluyver">Thomas</em>. Quisque iaculis ligula ut ipsum mattis viverra.
28
26
29 <p>Here is a plain paragraph that should be unaffected.</p>
27 <p>Here is a plain paragraph that should be unaffected. It contains simple
28 relations like 1<2 & 4>5.</p>
30
29
31 * One <cite data-cite="jdfreder">Jonathan</cite>.
30 * One <cite data-cite="jdfreder">Jonathan</cite>.
32 * Two <cite data-cite="carreau">Matthias</cite>.
31 * Two <cite data-cite="carreau">Matthias</cite>.
33 * Three <cite data-cite="ivanov">Paul</cite>.
32 * Three <cite data-cite="ivanov">Paul</cite>.
34 """
33 """: """
35
36 test_md_parsed = """
37 # My Heading
34 # My Heading
38
35
39 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
36 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus ac magna non augue
@@ -42,18 +39,112 b' velit, lobortis sed interdum at, vestibulum vitae libero \\cite{fperez}.'
42 Lorem ipsum dolor sit amet, consectetur adipiscing elit
39 Lorem ipsum dolor sit amet, consectetur adipiscing elit
43 \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra.
40 \cite{takluyver}. Quisque iaculis ligula ut ipsum mattis viverra.
44
41
45 <p>Here is a plain paragraph that should be unaffected.</p>
42 <p>Here is a plain paragraph that should be unaffected. It contains simple
43 relations like 1<2 & 4>5.</p>
46
44
47 * One \cite{jdfreder}.
45 * One \cite{jdfreder}.
48 * Two \cite{carreau}.
46 * Two \cite{carreau}.
49 * Three \cite{ivanov}.
47 * Three \cite{ivanov}.
50 """
48 """,
49
50 # No citations
51 r"""The quick brown fox jumps over the lazy dog.""":
52 r"""The quick brown fox jumps over the lazy dog.""",
53
54 # Simple inline
55 r"""Foo <cite data-cite=asdf>Text</cite> bar""":
56 r"""Foo \cite{asdf} bar""",
57
58 # Multiline
59 r"""<cite data-cite=ewqr>Text
60 </cite>Foo""":
61 r"""\cite{ewqr}Foo""",
62
63 # Nested tags
64 r"""<div><div data-cite=Foo><div>Text</div></div></div> Bar""":
65 r"""<div>\cite{Foo}</div> Bar""",
66
67 # Including Maths
68 r"""Foo $3*2*1$ <div data-cite=Foo>Text</div> Bar""":
69 r"""Foo $3*2*1$ \cite{Foo} Bar""",
70
71 # Missing end tag
72 r"""<cite data-cite=asdf>Test Foo""":
73 r"""\cite{asdf}""",
74
75 r"""<cite data-cite=asdf><cite>Test Foo""":
76 r"""\cite{asdf}""",
77
78 r"""<cite data-cite=asdf><cite>Test</cite> Foo""":
79 r"""\cite{asdf}""",
80
81 # Multiple arguments
82 r"""<cite width=qwer data-cite=asdf>Test</cite> Foo""":
83 r"""\cite{asdf} Foo""",
84
85 # Wrong capitalization
86 r"""<CITE data-cite=asdf>Test</cite> Foo""":
87 r"""\cite{asdf} Foo""",
88
89 r"""<cite DATA-CITE=asdf>Test</cite> Foo""":
90 r"""\cite{asdf} Foo""",
91
92 # Wrong end tag
93 r"""<asd data-cite=wer> ksjfs </asdf> sdf ds """:
94 r"""\cite{wer}""",
95
96 r"""<asd data-cite=wer>""":
97 r"""\cite{wer}""",
98
99 # Invalid tag names
100 r"""<frog> <foo data-cite=wer></foo>""":
101 r"""<frog> \cite{wer}""",
102
103 # Non-nested tags
104 r"""<strong> <h1> <cite data-cite=asdf></cite>Test</strong> Foo </h1>""":
105 r"""<strong> <h1> \cite{asdf}Test</strong> Foo </h1>""",
106
107 # LXML errors
108 r"""Foo
109 \begin{eqnarray}
110 1 & <cite data-cite=bar>bar1</cite> \\
111 3 & 4 \\
112 \end{eqnarray}""":
113 r"""Foo
114 \begin{eqnarray}
115 1 & \cite{bar} \\
116 3 & 4 \\
117 \end{eqnarray}""",
118
119 r"""
120 1<2 is true, but 3>4 is false.
121
122 $1<2$ is true, but $3>4$ is false.
123
124 1<2 it is even worse if it is alone in a line.""":
125 r"""
126 1<2 is true, but 3>4 is false.
127
128 $1<2$ is true, but $3>4$ is false.
129
130 1<2 it is even worse if it is alone in a line.""",
131
132 r"""
133 1 < 2 is true, but 3 > 4 is false
134
135 $1 < 2$ is true, but $3 > 4$ is false
136
137 1 < 2 it is even worse if it is alone in a line.
138 """:
139 r"""
140 1 < 2 is true, but 3 > 4 is false
141
142 $1 < 2$ is true, but $3 > 4$ is false
143
144 1 < 2 it is even worse if it is alone in a line.
145 """}
51
146
52 def test_citation2latex():
147 def test_citation2latex():
53 """Are citations parsed properly?"""
148 """Are citations parsed properly?"""
54 try:
149 for input, output in test_md.items():
55 from lxml import html #analysis:ignore
150 yield (assert_equal, citation2latex(input), output)
56 except ImportError:
57 assert_equal(test_md, citation2latex(test_md))
58 else:
59 assert_equal(test_md_parsed, citation2latex(test_md))
General Comments 0
You need to be logged in to leave comments. Login now