##// END OF EJS Templates
Replace lxml with HTMLParser in citation2latex
jakobgager -
Show More
@@ -9,9 +9,13 b''
9 #-----------------------------------------------------------------------------
9 #-----------------------------------------------------------------------------
10
10
11 #-----------------------------------------------------------------------------
11 #-----------------------------------------------------------------------------
12 # Code
12 # Imports
13 #-----------------------------------------------------------------------------
13 #-----------------------------------------------------------------------------
14 from HTMLParser import HTMLParser
14
15
16 #-----------------------------------------------------------------------------
17 # Functions
18 #-----------------------------------------------------------------------------
15
19
16 __all__ = ['citation2latex']
20 __all__ = ['citation2latex']
17
21
@@ -32,41 +36,72 b' def citation2latex(s):'
32 Any HTML tag can be used, which allows the citations to be formatted
36 Any HTML tag can be used, which allows the citations to be formatted
33 in HTML in any manner.
37 in HTML in any manner.
34 """
38 """
35 try:
39 parser = CitationParser()
36 from lxml import html
40 parser.feed(s)
37 except ImportError:
41 parser.close()
38 return s
42 outtext = u''
39
43 startpos = 0
40 tree = html.fragment_fromstring(s, create_parent='div')
44 for citation in parser.citelist:
41 _process_node_cite(tree)
45 outtext += s[startpos:citation[1]]
42 s = html.tostring(tree, encoding='unicode')
46 outtext += '\\cite{%s}'%citation[0]
43 if s.endswith('</div>'):
47 startpos = citation[2]
44 s = s[:-6]
48 outtext += s[startpos:]
45 if s.startswith('<div>'):
49 return outtext
46 s = s[5:]
47 return s
48
50
51 #-----------------------------------------------------------------------------
52 # Classes
53 #-----------------------------------------------------------------------------
54 class CitationParser(HTMLParser):
55 """Citation Parser
49
56
50 def _process_node_cite(node):
57 Replaces HTML tags that carry a data-cite attribute with the respective LaTeX \\cite.
51 """Do the citation replacement as we walk the lxml tree."""
52
58
53 def _get(o, name):
59 Inherits from HTMLParser, overrides:
54 value = getattr(o, name, None)
60 - handle_starttag
55 return '' if value is None else value
61 - handle_endtag
62 """
63 # number of open tags
64 opentags = None
65 # list of found citations
66 citelist = None
67 # active citation tag
68 citetag = None
69
70 def __init__(self):
71 self.citelist = []
72 self.opentags = 0
73 HTMLParser.__init__(self)
56
74
57 if 'data-cite' in node.attrib:
75 def get_offset(self):
58 cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
76 # Compute start position in source
59 prev = node.getprevious()
77 lin, offset = self.getpos()
60 if prev is not None:
78 pos = 0
61 prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')
79 for i in range(lin-1):
62 else:
80 pos = self.data.find('\n',pos) + 1
63 parent = node.getparent()
81 return pos + offset
64 if parent is not None:
82
65 parent.text = _get(parent, 'text') + cite + _get(node, 'tail')
83 def handle_starttag(self, tag, attrs):
66 try:
84 # for each tag check if attributes are present and if no citation is active
67 node.getparent().remove(node)
85 if self.opentags == 0 and len(attrs)>0:
68 except AttributeError:
86 for atr, data in attrs:
69 pass
87 if atr.lower() == 'data-cite':
70 else:
88 self.citetag = tag
71 for child in node:
89 self.opentags = 1
72 _process_node_cite(child)
90 self.citelist.append([data, self.get_offset()])
91 return
92
93 if tag == self.citetag:
94 # found an open citation tag but not the starting one
95 self.opentags += 1
96
97 def handle_endtag(self, tag):
98 if tag == self.citetag:
99 # found citation tag; check if it is the starting one
100 if self.opentags == 1:
101 pos = self.get_offset()
102 self.citelist[-1].append(pos+len(tag)+3)
103 self.opentags -= 1
104
105 def feed(self, data):
106 self.data = data
107 HTMLParser.feed(self, data)
General Comments 0
You need to be logged in to leave comments. Login now