Show More
@@ -9,9 +9,13 | |||||
9 | #----------------------------------------------------------------------------- |
|
9 | #----------------------------------------------------------------------------- | |
10 |
|
10 | |||
11 | #----------------------------------------------------------------------------- |
|
11 | #----------------------------------------------------------------------------- | |
12 | # Code |
|
12 | # Imports | |
13 | #----------------------------------------------------------------------------- |
|
13 | #----------------------------------------------------------------------------- | |
|
14 | from HTMLParser import HTMLParser | |||
14 |
|
15 | |||
|
16 | #----------------------------------------------------------------------------- | |||
|
17 | # Functions | |||
|
18 | #----------------------------------------------------------------------------- | |||
15 |
|
19 | |||
16 | __all__ = ['citation2latex'] |
|
20 | __all__ = ['citation2latex'] | |
17 |
|
21 | |||
@@ -32,41 +36,72 def citation2latex(s): | |||||
32 | Any HTML tag can be used, which allows the citations to be formatted |
|
36 | Any HTML tag can be used, which allows the citations to be formatted | |
33 | in HTML in any manner. |
|
37 | in HTML in any manner. | |
34 | """ |
|
38 | """ | |
35 | try: |
|
39 | parser = CitationParser() | |
36 | from lxml import html |
|
40 | parser.feed(s) | |
37 | except ImportError: |
|
41 | parser.close() | |
38 | return s |
|
42 | outtext = u'' | |
39 |
|
43 | startpos = 0 | ||
40 | tree = html.fragment_fromstring(s, create_parent='div') |
|
44 | for citation in parser.citelist: | |
41 | _process_node_cite(tree) |
|
45 | outtext += s[startpos:citation[1]] | |
42 | s = html.tostring(tree, encoding='unicode') |
|
46 | outtext += '\\cite{%s}'%citation[0] | |
43 | if s.endswith('</div>'): |
|
47 | startpos = citation[2] | |
44 | s = s[:-6] |
|
48 | outtext += s[startpos:] | |
45 | if s.startswith('<div>'): |
|
49 | return outtext | |
46 | s = s[5:] |
|
50 | ||
47 | return s |
|
51 | #----------------------------------------------------------------------------- | |
48 |
|
52 | # Classes | ||
49 |
|
53 | #----------------------------------------------------------------------------- | ||
50 | def _process_node_cite(node): |
|
54 | class CitationParser(HTMLParser): | |
51 | """Do the citation replacement as we walk the lxml tree.""" |
|
55 | """Citation Parser | |
52 |
|
56 | |||
53 | def _get(o, name): |
|
57 | Replaces html tags with data-cite attribute with respective latex \\cite. | |
54 | value = getattr(o, name, None) |
|
58 | ||
55 | return '' if value is None else value |
|
59 | Inherits from HTMLParser, overrides: | |
56 |
|
60 | - handle_starttag | ||
57 | if 'data-cite' in node.attrib: |
|
61 | - handle_endtag | |
58 | cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']} |
|
62 | """ | |
59 | prev = node.getprevious() |
|
63 | # number of open tags | |
60 | if prev is not None: |
|
64 | opentags = None | |
61 | prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail') |
|
65 | # list of found citations | |
62 | else: |
|
66 | citelist = None | |
63 | parent = node.getparent() |
|
67 | # active citation tag | |
64 | if parent is not None: |
|
68 | citetag = None | |
65 | parent.text = _get(parent, 'text') + cite + _get(node, 'tail') |
|
69 | ||
66 | try: |
|
70 | def __init__(self): | |
67 | node.getparent().remove(node) |
|
71 | self.citelist = [] | |
68 | except AttributeError: |
|
72 | self.opentags = 0 | |
69 | pass |
|
73 | HTMLParser.__init__(self) | |
70 |
|
|
74 | ||
71 | for child in node: |
|
75 | def get_offset(self): | |
72 | _process_node_cite(child) |
|
76 | # Compute start position in source | |
|
77 | lin, offset = self.getpos() | |||
|
78 | pos = 0 | |||
|
79 | for i in range(lin-1): | |||
|
80 | pos = self.data.find('\n',pos) + 1 | |||
|
81 | return pos + offset | |||
|
82 | ||||
|
83 | def handle_starttag(self, tag, attrs): | |||
|
84 | # for each tag check if attributes are present and if no citation is active | |||
|
85 | if self.opentags == 0 and len(attrs)>0: | |||
|
86 | for atr, data in attrs: | |||
|
87 | if atr.lower() == 'data-cite': | |||
|
88 | self.citetag = tag | |||
|
89 | self.opentags = 1 | |||
|
90 | self.citelist.append([data, self.get_offset()]) | |||
|
91 | return | |||
|
92 | ||||
|
93 | if tag == self.citetag: | |||
|
94 | # found an open citation tag but not the starting one | |||
|
95 | self.opentags += 1 | |||
|
96 | ||||
|
97 | def handle_endtag(self, tag): | |||
|
98 | if tag == self.citetag: | |||
|
99 | # found citation tag check if starting one | |||
|
100 | if self.opentags == 1: | |||
|
101 | pos = self.get_offset() | |||
|
102 | self.citelist[-1].append(pos+len(tag)+3) | |||
|
103 | self.opentags -= 1 | |||
|
104 | ||||
|
105 | def feed(self, data): | |||
|
106 | self.data = data | |||
|
107 | HTMLParser.feed(self, data) |
General Comments 0
You need to be logged in to leave comments.
Login now