##// END OF EJS Templates
Replace lxml with HTMLParser in citation2latex
jakobgager -
Show More
@@ -9,9 +9,13
9 9 #-----------------------------------------------------------------------------
10 10
11 11 #-----------------------------------------------------------------------------
12 # Code
12 # Imports
13 13 #-----------------------------------------------------------------------------
14 from HTMLParser import HTMLParser
14 15
16 #-----------------------------------------------------------------------------
17 # Functions
18 #-----------------------------------------------------------------------------
15 19
16 20 __all__ = ['citation2latex']
17 21
@@ -32,41 +36,72 def citation2latex(s):
32 36 Any HTML tag can be used, which allows the citations to be formatted
33 37 in HTML in any manner.
34 38 """
35 try:
36 from lxml import html
37 except ImportError:
38 return s
39
40 tree = html.fragment_fromstring(s, create_parent='div')
41 _process_node_cite(tree)
42 s = html.tostring(tree, encoding='unicode')
43 if s.endswith('</div>'):
44 s = s[:-6]
45 if s.startswith('<div>'):
46 s = s[5:]
47 return s
48
49
def _process_node_cite(node):
    """Do the citation replacement as we walk the lxml tree.

    A node carrying a ``data-cite`` attribute is replaced by the text
    ``\\cite{<value>}``.  That text, followed by the node's own tail, is
    appended to the tail of the previous sibling or, when there is no
    previous sibling, to the text of the parent.  The node is then removed
    from its parent.  Nodes without the attribute are left in place and
    their children are processed recursively.

    Parameters
    ----------
    node : element
        Node of the tree being walked; it must provide ``attrib``,
        ``getprevious()``, ``getparent()`` and iteration over its children.
    """

    def _get(o, name):
        # Missing or None text/tail is treated as the empty string.
        value = getattr(o, name, None)
        return '' if value is None else value

    if 'data-cite' not in node.attrib:
        # Iterate over a snapshot: the recursive call may remove children
        # from ``node`` while we are still walking them.
        for child in list(node):
            _process_node_cite(child)
        return

    # Escaped backslash: a bare '\c' is an invalid escape sequence.
    cite = '\\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
    replacement = cite + _get(node, 'tail')

    parent = node.getparent()
    prev = node.getprevious()
    if prev is not None:
        prev.tail = _get(prev, 'tail') + replacement
    elif parent is not None:
        parent.text = _get(parent, 'text') + replacement

    # A node without a parent (the root) cannot be removed; leave it alone.
    if parent is not None:
        parent.remove(node)
39 parser = CitationParser()
40 parser.feed(s)
41 parser.close()
42 outtext = u''
43 startpos = 0
44 for citation in parser.citelist:
45 outtext += s[startpos:citation[1]]
46 outtext += '\\cite{%s}'%citation[0]
47 startpos = citation[2]
48 outtext += s[startpos:]
49 return outtext
50
51 #-----------------------------------------------------------------------------
52 # Classes
53 #-----------------------------------------------------------------------------
class CitationParser(HTMLParser):
    """Citation Parser

    Locates html tags carrying a data-cite attribute so that they can be
    replaced with the respective latex \\cite.

    Every completed citation is stored in ``citelist`` as
    ``[key, start, end]``: ``key`` is the value of the data-cite attribute
    and ``start``/``end`` are offsets into the fed text such that
    ``text[start:end]`` spans the whole element, from the ``<`` of its
    start tag to the ``>`` of its matching end tag.  A citation whose end
    tag is never seen is not added to ``citelist``.

    Inherits from HTMLParser, overrides:
    - handle_starttag
    - handle_startendtag
    - handle_endtag
    - feed
    """
    # nesting depth of tags named like the active citation tag (0 = none)
    opentags = None
    # list of found citations
    citelist = None
    # active citation tag
    citetag = None

    def __init__(self):
        self.citelist = []
        self.opentags = 0
        self.citetag = None
        # all text fed so far; offsets in citelist index into it
        self.data = ''
        # [key, start] of the citation whose end tag is still awaited
        self._pending_cite = None
        # Explicit base-class call (as in the original) rather than super().
        HTMLParser.__init__(self)

    def get_offset(self):
        """Return the current parser position as an offset into self.data."""
        lin, offset = self.getpos()
        pos = 0
        # Skip the (lin - 1) complete lines preceding the current one.
        for _ in range(lin - 1):
            pos = self.data.find('\n', pos) + 1
        return pos + offset

    def _cite_key(self, attrs):
        """Return (found, value) for the first data-cite attribute."""
        for atr, data in attrs:
            if atr.lower() == 'data-cite':
                return True, data
        return False, None

    def handle_starttag(self, tag, attrs):
        if self.opentags == 0:
            # No citation is active: check if this tag starts one.
            found, key = self._cite_key(attrs)
            if found:
                self.citetag = tag
                self.opentags = 1
                self._pending_cite = [key, self.get_offset()]
            return

        if tag == self.citetag:
            # found an open citation tag but not the starting one
            self.opentags += 1

    def handle_startendtag(self, tag, attrs):
        # A self-closing tag (<cite data-cite="x"/>) is complete on its own:
        # its extent is exactly the raw text of the tag.  Inside an active
        # citation it neither opens nor closes anything.
        if self.opentags != 0:
            return
        found, key = self._cite_key(attrs)
        if found:
            start = self.get_offset()
            end = start + len(self.get_starttag_text())
            self.citelist.append([key, start, end])

    def handle_endtag(self, tag):
        # Ignore end tags while no citation is active, so that stray or
        # unrelated end tags cannot corrupt the depth counter.
        if self.opentags == 0 or tag != self.citetag:
            return
        self.opentags -= 1
        if self.opentags != 0:
            return

        # This end tag closes the citation.  Search for the real '>' rather
        # than assuming the tag is spelled exactly '</tag>'.
        start = self.get_offset()
        close = self.data.find('>', start)
        if close == -1:
            end = start + len(tag) + 3
        else:
            end = close + 1
        self._pending_cite.append(end)
        self.citelist.append(self._pending_cite)
        self._pending_cite = None
        # Forget the tag name so later tags of that name are not counted.
        self.citetag = None

    def feed(self, data):
        # getpos() is cumulative over successive feeds, so keep all the
        # text seen so far for get_offset() to index into.
        self.data += data
        HTMLParser.feed(self, data)
General Comments 0
You need to be logged in to leave comments. Login now