Show More
@@ -9,9 +9,13 | |||
|
9 | 9 | #----------------------------------------------------------------------------- |
|
10 | 10 | |
|
11 | 11 | #----------------------------------------------------------------------------- |
|
12 | # Code | |
|
12 | # Imports | |
|
13 | 13 | #----------------------------------------------------------------------------- |
|
14 | from HTMLParser import HTMLParser | |
|
14 | 15 | |
|
16 | #----------------------------------------------------------------------------- | |
|
17 | # Functions | |
|
18 | #----------------------------------------------------------------------------- | |
|
15 | 19 | |
|
16 | 20 | __all__ = ['citation2latex'] |
|
17 | 21 | |
@@ -32,41 +36,72 def citation2latex(s): | |||
|
32 | 36 | Any HTML tag can be used, which allows the citations to be formatted |
|
33 | 37 | in HTML in any manner. |
|
34 | 38 | """ |
|
35 | try: | |
|
36 | from lxml import html | |
|
37 | except ImportError: | |
|
38 | return s | |
|
39 | ||
|
40 | tree = html.fragment_fromstring(s, create_parent='div') | |
|
41 | _process_node_cite(tree) | |
|
42 | s = html.tostring(tree, encoding='unicode') | |
|
43 | if s.endswith('</div>'): | |
|
44 | s = s[:-6] | |
|
45 | if s.startswith('<div>'): | |
|
46 | s = s[5:] | |
|
47 | return s | |
|
48 | ||
|
49 | ||
|
def _process_node_cite(node):
    """Do the citation replacement as we walk the lxml tree.

    An element carrying a ``data-cite`` attribute is removed from its parent
    and the text ``\\cite{<ref>}`` (followed by the element's own tail text)
    is appended to whatever text immediately preceded the element.

    Parameters
    ----------
    node : element
        Node to process; it is modified in place together with its
        descendants.  Presumably an lxml.html element -- the code relies on
        ``attrib``, ``getprevious()``, ``getparent()``, ``remove()``,
        ``text`` and ``tail``.
    """

    def _get(o, name):
        # Missing text/tail is reported as None; treat that as empty text.
        value = getattr(o, name, None)
        return '' if value is None else value

    if 'data-cite' in node.attrib:
        # Raw string: '\c' is not a valid escape sequence.
        cite = r'\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}
        replacement = cite + _get(node, 'tail')
        prev = node.getprevious()
        parent = node.getparent()
        if prev is not None:
            # Text following the previous sibling lives in that sibling's tail.
            prev.tail = _get(prev, 'tail') + replacement
        elif parent is not None:
            # First child: the preceding text is the parent's own text.
            parent.text = _get(parent, 'text') + replacement
        if parent is not None:
            # The tail was copied above because removing the node drops it.
            parent.remove(node)

    # Iterate over a snapshot: children may remove themselves from `node`
    # while being processed.
    for child in list(node):
        _process_node_cite(child)
|
39 | parser = CitationParser() | |
|
40 | parser.feed(s) | |
|
41 | parser.close() | |
|
42 | outtext = u'' | |
|
43 | startpos = 0 | |
|
44 | for citation in parser.citelist: | |
|
45 | outtext += s[startpos:citation[1]] | |
|
46 | outtext += '\\cite{%s}'%citation[0] | |
|
47 | startpos = citation[2] | |
|
48 | outtext += s[startpos:] | |
|
49 | return outtext | |
|
50 | ||
|
51 | #----------------------------------------------------------------------------- | |
|
52 | # Classes | |
|
53 | #----------------------------------------------------------------------------- | |
|
class CitationParser(HTMLParser):
    """Citation Parser

    Locates html tags carrying a data-cite attribute so that they can be
    replaced with the respective latex \\cite.

    Feed the complete source with a single feed() call and then call close().
    Afterwards citelist holds one [reference, start, end] entry per citation,
    where source[start:end] spans the whole citation element.

    Inherits from HTMLParser, overrides:
     - handle_starttag
     - handle_endtag
     - feed
     - close
    """
    # number of open tags
    opentags = None
    # list of found citations
    citelist = None
    # active citation tag
    citetag = None

    def __init__(self):
        self.citelist = []
        self.opentags = 0
        HTMLParser.__init__(self)

    def get_offset(self):
        """Return the index into self.data of the current parser position."""
        lin, offset = self.getpos()
        pos = 0
        # getpos() reports a line number and a column; skip over the
        # preceding lines to turn that into a flat string index.
        for _ in range(lin - 1):
            pos = self.data.find('\n', pos) + 1
        return pos + offset

    def handle_starttag(self, tag, attrs):
        """Record the start of a citation, or count a nested same-name tag."""
        # A new citation can only start while no other citation is active.
        if self.opentags == 0:
            for atr, data in attrs:
                if atr.lower() == 'data-cite':
                    self.citetag = tag
                    self.opentags = 1
                    self.citelist.append([data, self.get_offset()])
                    return

        if tag == self.citetag:
            # found an open citation tag but not the starting one
            self.opentags += 1

    def handle_endtag(self, tag):
        """Complete the active citation when its outermost tag is closed."""
        if tag != self.citetag:
            return
        if self.opentags == 1:
            pos = self.get_offset()
            # The end tag may contain whitespace ("</a >") and an empty
            # element ("<a data-cite='x'/>") is reported at the start of its
            # own tag, so search for the closing '>' instead of assuming a
            # fixed "</tag>" length.
            end = self.data.find('>', pos) + 1
            if end == 0:
                end = len(self.data)
            self.citelist[-1].append(end)
            # Forget the tag so that later tags of the same name (or stray
            # end tags) are not attributed to this finished citation.
            self.citetag = None
        self.opentags -= 1

    def feed(self, data):
        """Parse data, keeping a reference to it for offset computation.

        The stored text is replaced on every call, so the offsets are only
        meaningful when the whole source is passed in one call.
        """
        self.data = data
        HTMLParser.feed(self, data)

    def close(self):
        """Finish parsing and discard a citation that was never closed.

        Without an end offset the entry is unusable for replacing the
        citation, so the corresponding source text is left untouched.
        """
        HTMLParser.close(self)
        if self.citelist and len(self.citelist[-1]) < 3:
            self.citelist.pop()
            self.citetag = None
            self.opentags = 0
General Comments 0
You need to be logged in to leave comments.
Login now