upstream/ipython Commit - r15051:c6fe9d02

Replace lxml with HTMLParser in citation2latex

jakobgager -

r15051:c6fe9d02

parent child

IPython/nbconvert/filters/citation.py

0 +70 -35

@@ -1,72 +1,107 b''
1	"""Citation handling for LaTeX output."""	1	"""Citation handling for LaTeX output."""
2		2
3	#-----------------------------------------------------------------------------	3	#-----------------------------------------------------------------------------
4	# Copyright (c) 2013, the IPython Development Team.	4	# Copyright (c) 2013, the IPython Development Team.
5	#	5	#
6	# Distributed under the terms of the Modified BSD License.	6	# Distributed under the terms of the Modified BSD License.
7	#	7	#
8	# The full license is in the file COPYING.txt, distributed with this software.	8	# The full license is in the file COPYING.txt, distributed with this software.
9	#-----------------------------------------------------------------------------	9	#-----------------------------------------------------------------------------
10		10
11	#-----------------------------------------------------------------------------	11	#-----------------------------------------------------------------------------
12	# Code	12	# Imports
13	#-----------------------------------------------------------------------------	13	#-----------------------------------------------------------------------------
		14	from HTMLParser import HTMLParser
14		15
		16	#-----------------------------------------------------------------------------
		17	# Functions
		18	#-----------------------------------------------------------------------------
15		19
16	__all__ = ['citation2latex']	20	__all__ = ['citation2latex']
17		21
18		22
19	def citation2latex(s):	23	def citation2latex(s):
20	"""Parse citations in Markdown cells.	24	"""Parse citations in Markdown cells.
21		25
22	This looks for HTML tags having a data attribute named `data-cite`	26	This looks for HTML tags having a data attribute named `data-cite`
23	and replaces it by the call to LaTeX cite command. The transformation	27	and replaces it by the call to LaTeX cite command. The transformation
24	looks like this:	28	looks like this:
25		29
26	`<cite data-cite="granger">(Granger, 2013)</cite>`	30	`<cite data-cite="granger">(Granger, 2013)</cite>`
27		31
28	Becomes	32	Becomes
29		33
30	`\\cite{granger}`	34	`\\cite{granger}`
31		35
32	Any HTML tag can be used, which allows the citations to be formatted	36	Any HTML tag can be used, which allows the citations to be formatted
33	in HTML in any manner.	37	in HTML in any manner.
34	"""	38	"""
35	try:	39	parser = CitationParser()
36	from lxml import html	40	parser.feed(s)
37	except ImportError:	41	parser.close()
38	return s	42	outtext = u''
39		43	startpos = 0
40	tree = html.fragment_fromstring(s, create_parent='div')	44	for citation in parser.citelist:
41	_process_node_cite(tree)	45	outtext += s[startpos:citation[1]]
42	s = html.tostring(tree, encoding='unicode')	46	outtext += '\\cite{%s}'%citation[0]
43	if s.endswith('</div>'):	47	startpos = citation[2]
44	s = s[:-6]	48	outtext += s[startpos:]
45	if s.startswith('<div>'):	49	return outtext
46	s = s[5:]
47	return s
48		50
		51	#-----------------------------------------------------------------------------
		52	# Classes
		53	#-----------------------------------------------------------------------------
		54	class CitationParser(HTMLParser):
		55	"""Citation Parser
49		56
50	def _process_node_cite(node):	57	Replaces html tags with data-cite attribute with respective latex \\cite.
51	"""Do the citation replacement as we walk the lxml tree."""
52		58
53	def _get(o, name):	59	Inherits from HTMLParser, overrides:
54	value = getattr(o, name, None)	60	- handle_starttag
55	return '' if value is None else value	61	- handle_endtag
		62	"""
		63	# number of open tags
		64	opentags = None
		65	# list of found citations
		66	citelist = None
		67	# active citation tag
		68	citetag = None
		69
		70	def __init__(self):
		71	self.citelist = []
		72	self.opentags = 0
		73	HTMLParser.__init__(self)
56		74
57	if 'data-cite' in node.attrib:	75	def get_offset(self):
58	cite = '\cite{%(ref)s}' % {'ref': node.attrib['data-cite']}	76	# Compute startposition in source
59	prev = node.getprevious()	77	lin, offset = self.getpos()
60	if prev is not None:	78	pos = 0
61	prev.tail = _get(prev, 'tail') + cite + _get(node, 'tail')	79	for i in range(lin-1):
62	else:	80	pos = self.data.find('\n',pos) + 1
63	parent = node.getparent()	81	return pos + offset
64	if parent is not None:	82
65	parent.text = _get(parent, 'text') + cite + _get(node, 'tail')	83	def handle_starttag(self, tag, attrs):
66	try:	84	# for each tag check if attributes are present and if no citation is active
67	node.getparent().remove(node)	85	if self.opentags == 0 and len(attrs)>0:
68	except AttributeError:	86	for atr, data in attrs:
69	pass	87	if atr.lower() == 'data-cite':
70	else:	88	self.citetag = tag
71	for child in node:	89	self.opentags = 1
72	_process_node_cite(child)	90	self.citelist.append([data, self.get_offset()])
		91	return
		92
		93	if tag == self.citetag:
		94	# found an open citation tag but not the starting one
		95	self.opentags += 1
		96
		97	def handle_endtag(self, tag):
		98	if tag == self.citetag:
		99	# found citation tag check if starting one
		100	if self.opentags == 1:
		101	pos = self.get_offset()
		102	self.citelist[-1].append(pos+len(tag)+3)
		103	self.opentags -= 1
		104
		105	def feed(self, data):
		106	self.data = data
		107	HTMLParser.feed(self, data)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages