upstream/ipython Files · converters/base.py

Allow to build a converter without input file

Matthias BUSSONNIER - - Load All Authors

File last commit:

r9571:48f7d4d6


                r9571:48f7d4d6

Download file

             base.py
        
                    433 lines
            
             | 14.2 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / converters / base.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      """Base classes for the notebook conversion pipeline.

      This module defines Converter, from which all objects designed to implement

      a conversion of IPython notebooks to some other format should inherit.

      """

      #-----------------------------------------------------------------------------

      # Copyright (c) 2012, the IPython Development Team.

      #

      # Distributed under the terms of the Modified BSD License.

      #

      # The full license is in the file COPYING.txt, distributed with this software.

      #-----------------------------------------------------------------------------

      #-----------------------------------------------------------------------------

      # Imports

      #-----------------------------------------------------------------------------

      from __future__ import print_function, absolute_import

      # Stdlib imports

      import codecs

      import io

      import logging

      import os

      import pprint

      import re

      from types import FunctionType

      # IPython imports

      from IPython.nbformat import current as nbformat

      from IPython.config.configurable import Configurable, SingletonConfigurable

      from IPython.utils.traitlets import (List, Unicode, Type, Bool, Dict, CaselessStrEnum,

                                          Any)

      # Our own imports

      from .utils import remove_fake_files_url

      #-----------------------------------------------------------------------------

      # Local utilities

      #-----------------------------------------------------------------------------

      def clean_filename(filename):
          """
          Sanitize a filename by replacing unsafe characters with underscores.

          Parameters
          ----------
          filename : str
              The filename to be sanitized.

          Returns
          -------
          clean : str
              The input with every character other than ``a-z``, ``A-Z``,
              ``0-9`` and ``_`` replaced by a single underscore; the length
              of the string is unchanged.
          """
          disallowed = re.compile(r'[^a-zA-Z0-9_]')
          return disallowed.sub('_', filename)

      #-----------------------------------------------------------------------------

      # Class declarations

      #-----------------------------------------------------------------------------

      class ConversionException(Exception):
          """Exception subclass declared by the converters module; adds no behaviour."""

      class DocStringInheritor(type):
          """
          Metaclass that copies docstrings from base-class methods.

          For every plain function in the body of the class being created, the
          direct bases are searched in order for an attribute of the same name.
          The docstring of the first such attribute that has a non-empty
          docstring is assigned to the new function.  Note that the assignment
          is unconditional: a docstring written on the derived method itself is
          overwritten whenever a base provides one.

          Please use carefully, this follows Michael Foord's Metaclass tutorial
          (http://www.voidspace.org.uk/python/articles/metaclasses.shtml) and
          may have missed a step or two.

          source:
          http://groups.google.com/group/comp.lang.python/msg/26f7b4fcb4d66c95
          by Paul McGuire
          """
          def __new__(meta, classname, bases, classDict):
              """Create the class after patching function docstrings in place."""
              for name, member in classDict.items():
                  # Only plain functions are considered; other attributes are
                  # passed through untouched.
                  if not isinstance(member, FunctionType):
                      continue
                  # Lazily walk the bases that expose ``name``; the first one
                  # with a truthy docstring wins and the search stops there.
                  donors = (getattr(base, name)
                            for base in bases if hasattr(base, name))
                  inherited = next((fn.__doc__ for fn in donors if fn.__doc__),
                                   None)
                  if inherited:
                      member.__doc__ = inherited
              # Hand type() a fresh dict holding the same name -> attribute pairs.
              return type.__new__(meta, classname, bases, dict(classDict))

      class Converter(Configurable):
          """Base class for notebook converters.

          ``convert`` assembles the output from ``optional_header``,
          ``main_body`` and ``optional_footer``.  ``main_body`` looks up a
          ``render_<cell_type>`` method for each cell via ``dispatch``; most of
          those render methods raise NotImplementedError here and have to be
          supplied by subclasses.
          """
          # The docstring-inheriting metaclass is currently disabled.
          #__metaclass__ = DocStringInheritor
          #-------------------------------------------------------------------------
          # Class-level attributes determining the behaviour of the class but
          # probably not varying from instance to instance.
          #-------------------------------------------------------------------------
          default_encoding = 'utf-8'
          extension = str()
          blank_symbol = " "
          # Which display data format is best? Subclasses can override if
          # they have specific requirements.
          display_data_priority = ['pdf', 'svg', 'png', 'jpg', 'text']
          #-------------------------------------------------------------------------
          # Instance-level attributes that are set in the constructor for this
          # class.
          #-------------------------------------------------------------------------
          infile = Any()

          highlight_source = Bool(True,
                           config=True,
                           help="Enable syntax highlighting for code blocks.")

          preamble = Unicode( "" ,
                              config=True,
                              help="Path to a user-specified preamble file")

          extract_figures = Bool( True,
                                  config=True,
                                  help="""extract base-64 encoded figures of the notebook into separate files,
                                       replace by link to corresponding file in source.""")

          infile_dir = Unicode()
          infile_root = Unicode()
          clean_name = Unicode()
          files_dir = Unicode()
          outbase = Unicode()
          #-------------------------------------------------------------------------
          # Instance-level attributes that are set by other methods in the base
          # class.
          #-------------------------------------------------------------------------
          figures_counter = 0
          output = Unicode()
          #-------------------------------------------------------------------------
          # Instance-level attributes that are not actually mentioned further
          # in this class. TODO: Could they be usefully moved to a subclass?
          #-------------------------------------------------------------------------
          with_preamble = Bool(True,config=True)
          user_preamble = None
          raw_as_verbatim = False

          def __init__(self, infile=None, config=None, exclude=None, **kw):
              """Set up the converter, optionally bound to an input notebook file.

              Parameters
              ----------
              infile : str, optional
                  Path of the notebook to convert.  When truthy, the derived
                  path attributes (``infile_dir``, ``infile_root``,
                  ``clean_name``, ``files_dir``, ``outbase``) are filled in and
                  the ``<clean_name>_files`` directory is created next to the
                  notebook if it does not exist yet.
              config : optional
                  Passed through to the ``Configurable`` constructor.
              exclude : list, optional
                  Stored as ``self.exclude_cells``.  Defaults to a new empty
                  list for each instance.
              **kw
                  Accepted and ignored by this constructor.
              """
              super(Converter, self).__init__(config=config)
              # N.B. Initialized in the same order as defined above. Please try to
              # keep in this way for readability's sake.
              # A fresh list per instance avoids sharing one mutable default.
              self.exclude_cells = [] if exclude is None else exclude
              self.infile = infile
              if infile:
                  self.infile_dir, infile_root = os.path.split(infile)
                  self.infile_root = os.path.splitext(infile_root)[0]
                  self.clean_name = clean_filename(self.infile_root)
                  # Handle the creation of a directory for ancillary files, for
                  # formats that need one.
                  files_dir = os.path.join(self.infile_dir, self.clean_name + '_files')
                  if not os.path.isdir(files_dir):
                      os.mkdir(files_dir)
                  self.files_dir = files_dir
                  self.outbase = os.path.join(self.infile_dir, self.infile_root)

          def __del__(self):
              """Remove the ancillary files directory if it exists and is empty."""
              if os.path.isdir(self.files_dir) and not os.listdir(self.files_dir):
                  os.rmdir(self.files_dir)

          def _get_prompt_number(self, cell):
              """Return ``cell.prompt_number``, or ``blank_symbol`` if it has none."""
              return cell.prompt_number if hasattr(cell, 'prompt_number') \
                  else self.blank_symbol

          def dispatch(self, cell_type):
              """Return the render method for ``cell_type``, e.g. ``render_code``.

              Falls back to ``render_unknown`` when no ``render_<cell_type>``
              attribute exists.
              """
              return getattr(self, 'render_' + cell_type, self.render_unknown)

          def dispatch_display_format(self, format):
              """Return the render method for a display format.

              Looks up ``render_display_format_<format>`` and falls back to
              ``render_unknown_display`` when it does not exist.
              """
              return getattr(self, 'render_display_format_' + format,
                             self.render_unknown_display)

          def convert(self, cell_separator='\n'):
              """
              Generic method to convert the notebook to a string representation.

              This is accomplished by dispatching on the cell_type, so subclasses of
              the Converter class do not need to re-implement this method, but just
              need implementations for the methods that will be dispatched.

              Parameters
              ----------
              cell_separator : string
                Character or string to join cells with. Default is "\\n"

              Returns
              -------
              out : string
                Header, body and footer lines joined with newlines.
              """
              lines = []
              lines.extend(self.optional_header())
              lines.extend(self.main_body(cell_separator))
              lines.extend(self.optional_footer())
              return u'\n'.join(lines)

          def main_body(self, cell_separator='\n'):
              """Render every cell of every worksheet and return a list of lines.

              Each cell is rendered by the method ``dispatch`` selects; the
              rendered cells are joined with ``cell_separator`` and the result
              is split back into individual lines.
              """
              converted_cells = []
              for worksheet in self.nb.worksheets:
                  for cell in worksheet.cells:
                      #print(cell.cell_type)  # dbg
                      conv_fn = self.dispatch(cell.cell_type)
                      if cell.cell_type in ('markdown', 'raw'):
                          remove_fake_files_url(cell)
                      converted_cells.append('\n'.join(conv_fn(cell)))
              cell_lines = cell_separator.join(converted_cells).split('\n')
              return cell_lines

          def render(self):
              """Read (if needed), convert, and save the notebook.

              The notebook is only read from ``self.infile`` when no ``nb``
              attribute has been set yet.  Returns the result of ``save()``.
              """
              if not hasattr(self, 'nb'):
                  self.read()
              self.output = self.convert()
              # ``unicode`` is the Python 2 text type this module targets.
              assert(type(self.output) == unicode)
              return self.save()

          def read(self):
              "read and parse notebook into NotebookNode called self.nb"
              with open(self.infile) as f:
                  self.nb = nbformat.read(f, 'json')

          def save(self, outfile=None, encoding=None):
              """Write ``self.output`` to a file and return its absolute path.

              Parameters
              ----------
              outfile : str, optional
                  Destination path.  Defaults to ``outbase + '.' + extension``.
              encoding : str, optional
                  Text encoding of the file.  Defaults to ``default_encoding``.
              """
              if outfile is None:
                  outfile = self.outbase + '.' + self.extension
              if encoding is None:
                  encoding = self.default_encoding
              with io.open(outfile, 'w', encoding=encoding) as f:
                  f.write(self.output)
              return os.path.abspath(outfile)

          def optional_header(self):
              """
              Optional header to insert at the top of the converted notebook

              Returns a list
              """
              return []

          def optional_footer(self):
              """
              Optional footer to insert at the end of the converted notebook

              Returns a list
              """
              return []

          def _new_figure(self, data, fmt):
              """Create a new figure file in the given format.

              The file is named ``<clean_name>_fig_<NN>.<fmt>`` using the running
              ``figures_counter`` (incremented on every call) and is written
              into ``files_dir``.

              Returns the path of the new file (``files_dir`` joined with the
              figure name).
              """
              figname = '%s_fig_%02i.%s' % (self.clean_name,
                                            self.figures_counter, fmt)
              self.figures_counter += 1
              fullname = os.path.join(self.files_dir, figname)

              # Binary files are base64-encoded, SVG is already XML
              if fmt in ('png', 'jpg', 'pdf'):
                  # NOTE(review): the 'base64' str codec is Python 2 only.
                  data = data.decode('base64')
                  fopen = lambda fname: open(fname, 'wb')
              else:
                  fopen = lambda fname: codecs.open(fname, 'wb',
                                                    self.default_encoding)

              with fopen(fullname) as f:
                  f.write(data)

              return fullname

          def render_heading(self, cell):
              """convert a heading cell

              Returns list."""
              raise NotImplementedError

          def render_code(self, cell):
              """Convert a code cell

              Returns list."""
              raise NotImplementedError

          def render_markdown(self, cell):
              """convert a markdown cell

              Returns list."""
              raise NotImplementedError

          def _img_lines(self, img_file):
              """Return list of lines to include an image file."""
              # Note: subclasses may choose to implement format-specific _FMT_lines
              # methods if they so choose (FMT in {png, svg, jpg, pdf}).
              raise NotImplementedError

          def render_display_data(self, output):
              """convert display data from the output of a code cell

              The format is the first entry of ``display_data_priority`` found
              in ``output``; failing that, the first key of ``output`` other
              than ``'output_type'``.

              Returns list.

              Raises RuntimeError if ``output`` holds no display data at all.
              """
              for fmt in self.display_data_priority:
                  if fmt in output:
                      break
              else:
                  # None of the preferred formats is present: take any other key.
                  for fmt in output:
                      if fmt != 'output_type':
                          break
                  else:
                      raise RuntimeError('no display data')

              # Is it an image?
              if fmt in ['png', 'svg', 'jpg', 'pdf'] and self.extract_figures:
                  print('I will extract this', fmt)
                  img_file = self._new_figure(output[fmt], fmt)
                  # Subclasses can have format-specific render functions (e.g.,
                  # latex has to auto-convert all SVG to PDF first).
                  lines_fun = getattr(self, '_%s_lines' % fmt, None)
                  if not lines_fun:
                      lines_fun = self._img_lines
                  lines = lines_fun(img_file)
              else:
                  print('I will NOT extract this', fmt)
                  lines_fun = self.dispatch_display_format(fmt)
                  lines = lines_fun(output)

              return lines

          def render_raw(self, cell):
              """convert a cell with raw text

              Returns list."""
              raise NotImplementedError

          def render_unknown(self, cell):
              """Render cells of unknown type

              Logs a warning and passes the pretty-printed cell to
              ``_unknown_lines``.

              Returns list."""
              data = pprint.pformat(cell)
              logging.warning('Unknown cell: %s', cell.cell_type)
              return self._unknown_lines(data)

          def render_unknown_display(self, output, type=None):
              """Render display output of unknown format

              ``type`` is unused; it is optional so that this method can be
              called with the output alone, as ``render_display_data`` does
              with the function returned by ``dispatch_display_format``.

              Returns list."""
              data = pprint.pformat(output)
              logging.warning('Unknown output: %s', output.output_type)
              return self._unknown_lines(data)

          def render_stream(self, output):
              """render the stream part of an output

              Returns list.

              Identical to render_display_format_text
              """
              return self.render_display_format_text(output)

          def render_pyout(self, output):
              """convert pyout part of a code cell

              Returns list."""
              raise NotImplementedError

          def render_pyerr(self, output):
              """convert pyerr part of a code cell

              Returns list."""
              raise NotImplementedError

          def _unknown_lines(self, data):
              """Return list of lines for an unknown cell.

              Parameters
              ----------
              data : str
                The content of the unknown data as a single string.
              """
              raise NotImplementedError

          # These are the possible format types in an output node

          def render_display_format_text(self, output):
              """render the text part of an output

              Returns list.
              """
              raise NotImplementedError

          def render_display_format_html(self, output):
              """render the html part of an output

              Returns list.
              """
              raise NotImplementedError

          def render_display_format_latex(self, output):
              """render the latex part of an output

              Returns list.
              """
              raise NotImplementedError

          def render_display_format_json(self, output):
              """render the json part of an output

              Returns list.
              """
              raise NotImplementedError

          def render_display_format_javascript(self, output):
              """render the javascript part of an output

              Returns list.
              """
              raise NotImplementedError

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				"""Base classes for the notebook conversion pipeline.

				This module defines Converter, from which all objects designed to implement
				a conversion of IPython notebooks to some other format should inherit.
				"""
				#-----------------------------------------------------------------------------
				# Copyright (c) 2012, the IPython Development Team.
				#
				# Distributed under the terms of the Modified BSD License.
				#
				# The full license is in the file COPYING.txt, distributed with this software.
				#-----------------------------------------------------------------------------

				#-----------------------------------------------------------------------------
				# Imports
				#-----------------------------------------------------------------------------

				from __future__ import print_function, absolute_import

				# Stdlib imports
				import codecs
				import io
				import logging
				import os
				import pprint
				import re
				from types import FunctionType

				# IPython imports
				from IPython.nbformat import current as nbformat
				from IPython.config.configurable import Configurable, SingletonConfigurable
				from IPython.utils.traitlets import (List, Unicode, Type, Bool, Dict, CaselessStrEnum,
				Any)

				# Our own imports
				from .utils import remove_fake_files_url


				#-----------------------------------------------------------------------------
				# Local utilities
				#-----------------------------------------------------------------------------

				def clean_filename(filename):
				    """
				    Sanitize a filename by replacing unsafe characters with underscores.

				    Parameters
				    ----------
				    filename : str
				        The filename to be sanitized.

				    Returns
				    -------
				    clean : str
				        The input with every character other than ``a-z``, ``A-Z``,
				        ``0-9`` and ``_`` replaced by a single underscore.
				    """
				    filename = re.sub(r'[^a-zA-Z0-9_]', '_', filename)
				    return filename


				#-----------------------------------------------------------------------------
				# Class declarations
				#-----------------------------------------------------------------------------

				class ConversionException(Exception):
				    """Exception subclass declared by the converters module; adds no behaviour."""
				    pass


				class DocStringInheritor(type):
				    """
				    This metaclass will walk the list of bases until the desired
				    superclass method is found AND if that method has a docstring and only
				    THEN does it attach the superdocstring to the derived class method.
				    The derived method's own docstring, if any, is overwritten.

				    Please use carefully, I just did the metaclass thing by following
				    Michael Foord's Metaclass tutorial
				    (http://www.voidspace.org.uk/python/articles/metaclasses.shtml), I may
				    have missed a step or two.

				    source:
				    http://groups.google.com/group/comp.lang.python/msg/26f7b4fcb4d66c95
				    by Paul McGuire
				    """
				    def __new__(meta, classname, bases, classDict):
				        """Create the class after patching function docstrings in place."""
				        newClassDict = {}
				        for attributeName, attribute in classDict.items():
				            if type(attribute) == FunctionType:
				                # look through bases for matching function by name
				                for baseclass in bases:
				                    if hasattr(baseclass, attributeName):
				                        basefn = getattr(baseclass, attributeName)
				                        if basefn.__doc__:
				                            attribute.__doc__ = basefn.__doc__
				                            # stop at the first base that supplies a docstring
				                            break
				            newClassDict[attributeName] = attribute
				        return type.__new__(meta, classname, bases, newClassDict)


				class Converter(Configurable):
				    """Base class for notebook converters.

				    ``convert`` assembles the output from ``optional_header``,
				    ``main_body`` and ``optional_footer``.  ``main_body`` looks up a
				    ``render_<cell_type>`` method for each cell via ``dispatch``; most of
				    those render methods raise NotImplementedError here and have to be
				    supplied by subclasses.
				    """
				    # The docstring-inheriting metaclass is currently disabled.
				    #__metaclass__ = DocStringInheritor
				    #-------------------------------------------------------------------------
				    # Class-level attributes determining the behaviour of the class but
				    # probably not varying from instance to instance.
				    #-------------------------------------------------------------------------
				    default_encoding = 'utf-8'
				    extension = str()
				    blank_symbol = " "
				    # Which display data format is best? Subclasses can override if
				    # they have specific requirements.
				    display_data_priority = ['pdf', 'svg', 'png', 'jpg', 'text']
				    #-------------------------------------------------------------------------
				    # Instance-level attributes that are set in the constructor for this
				    # class.
				    #-------------------------------------------------------------------------
				    infile = Any()

				    highlight_source = Bool(True,
				                     config=True,
				                     help="Enable syntax highlighting for code blocks.")

				    preamble = Unicode( "" ,
				                        config=True,
				                        help="Path to a user-specified preamble file")

				    extract_figures = Bool( True,
				                            config=True,
				                            help="""extract base-64 encoded figures of the notebook into separate files,
				                                 replace by link to corresponding file in source.""")

				    infile_dir = Unicode()
				    infile_root = Unicode()
				    clean_name = Unicode()
				    files_dir = Unicode()
				    outbase = Unicode()
				    #-------------------------------------------------------------------------
				    # Instance-level attributes that are set by other methods in the base
				    # class.
				    #-------------------------------------------------------------------------
				    figures_counter = 0
				    output = Unicode()
				    #-------------------------------------------------------------------------
				    # Instance-level attributes that are not actually mentioned further
				    # in this class. TODO: Could they be usefully moved to a subclass?
				    #-------------------------------------------------------------------------
				    with_preamble = Bool(True,config=True)
				    user_preamble = None
				    raw_as_verbatim = False

				    def __init__(self, infile=None, config=None, exclude=None, **kw):
				        """Set up the converter, optionally bound to an input notebook file.

				        Parameters
				        ----------
				        infile : str, optional
				            Path of the notebook to convert.  When truthy, the derived
				            path attributes (``infile_dir``, ``infile_root``,
				            ``clean_name``, ``files_dir``, ``outbase``) are filled in and
				            the ``<clean_name>_files`` directory is created next to the
				            notebook if it does not exist yet.
				        config : optional
				            Passed through to the ``Configurable`` constructor.
				        exclude : list, optional
				            Stored as ``self.exclude_cells``.  Defaults to a new empty
				            list for each instance.
				        **kw
				            Accepted and ignored by this constructor.
				        """
				        super(Converter, self).__init__(config=config)
				        # N.B. Initialized in the same order as defined above. Please try to
				        # keep in this way for readability's sake.
				        # A fresh list per instance avoids sharing one mutable default.
				        self.exclude_cells = [] if exclude is None else exclude
				        self.infile = infile
				        if infile:
				            self.infile_dir, infile_root = os.path.split(infile)
				            self.infile_root = os.path.splitext(infile_root)[0]
				            self.clean_name = clean_filename(self.infile_root)
				            # Handle the creation of a directory for ancillary files, for
				            # formats that need one.
				            files_dir = os.path.join(self.infile_dir, self.clean_name + '_files')
				            if not os.path.isdir(files_dir):
				                os.mkdir(files_dir)
				            self.files_dir = files_dir
				            self.outbase = os.path.join(self.infile_dir, self.infile_root)

				    def __del__(self):
				        """Remove the ancillary files directory if it exists and is empty."""
				        if os.path.isdir(self.files_dir) and not os.listdir(self.files_dir):
				            os.rmdir(self.files_dir)

				    def _get_prompt_number(self, cell):
				        """Return ``cell.prompt_number``, or ``blank_symbol`` if it has none."""
				        return cell.prompt_number if hasattr(cell, 'prompt_number') \
				            else self.blank_symbol

				    def dispatch(self, cell_type):
				        """Return the render method for ``cell_type``, e.g. ``render_code``.

				        Falls back to ``render_unknown`` when no ``render_<cell_type>``
				        attribute exists.
				        """
				        return getattr(self, 'render_' + cell_type, self.render_unknown)

				    def dispatch_display_format(self, format):
				        """Return the render method for a display format.

				        Looks up ``render_display_format_<format>`` and falls back to
				        ``render_unknown_display`` when it does not exist.
				        """
				        return getattr(self, 'render_display_format_' + format,
				                       self.render_unknown_display)

				    def convert(self, cell_separator='\n'):
				        """
				        Generic method to convert the notebook to a string representation.

				        This is accomplished by dispatching on the cell_type, so subclasses of
				        the Converter class do not need to re-implement this method, but just
				        need implementations for the methods that will be dispatched.

				        Parameters
				        ----------
				        cell_separator : string
				          Character or string to join cells with. Default is "\\n"

				        Returns
				        -------
				        out : string
				          Header, body and footer lines joined with newlines.
				        """
				        lines = []
				        lines.extend(self.optional_header())
				        lines.extend(self.main_body(cell_separator))
				        lines.extend(self.optional_footer())
				        return u'\n'.join(lines)

				    def main_body(self, cell_separator='\n'):
				        """Render every cell of every worksheet and return a list of lines.

				        Each cell is rendered by the method ``dispatch`` selects; the
				        rendered cells are joined with ``cell_separator`` and the result
				        is split back into individual lines.
				        """
				        converted_cells = []
				        for worksheet in self.nb.worksheets:
				            for cell in worksheet.cells:
				                #print(cell.cell_type) # dbg
				                conv_fn = self.dispatch(cell.cell_type)
				                if cell.cell_type in ('markdown', 'raw'):
				                    remove_fake_files_url(cell)
				                converted_cells.append('\n'.join(conv_fn(cell)))
				        cell_lines = cell_separator.join(converted_cells).split('\n')
				        return cell_lines

				    def render(self):
				        """Read (if needed), convert, and save the notebook.

				        The notebook is only read from ``self.infile`` when no ``nb``
				        attribute has been set yet.  Returns the result of ``save()``.
				        """
				        if not hasattr(self, 'nb'):
				            self.read()
				        self.output = self.convert()
				        # ``unicode`` is the Python 2 text type this module targets.
				        assert(type(self.output) == unicode)
				        return self.save()

				    def read(self):
				        "read and parse notebook into NotebookNode called self.nb"
				        with open(self.infile) as f:
				            self.nb = nbformat.read(f, 'json')

				    def save(self, outfile=None, encoding=None):
				        """Write ``self.output`` to a file and return its absolute path.

				        Parameters
				        ----------
				        outfile : str, optional
				            Destination path.  Defaults to ``outbase + '.' + extension``.
				        encoding : str, optional
				            Text encoding of the file.  Defaults to ``default_encoding``.
				        """
				        if outfile is None:
				            outfile = self.outbase + '.' + self.extension
				        if encoding is None:
				            encoding = self.default_encoding
				        with io.open(outfile, 'w', encoding=encoding) as f:
				            f.write(self.output)
				        return os.path.abspath(outfile)

				    def optional_header(self):
				        """
				        Optional header to insert at the top of the converted notebook

				        Returns a list
				        """
				        return []

				    def optional_footer(self):
				        """
				        Optional footer to insert at the end of the converted notebook

				        Returns a list
				        """
				        return []

				    def _new_figure(self, data, fmt):
				        """Create a new figure file in the given format.

				        The file is named ``<clean_name>_fig_<NN>.<fmt>`` using the running
				        ``figures_counter`` (incremented on every call) and is written
				        into ``files_dir``.

				        Returns the path of the new file (``files_dir`` joined with the
				        figure name).
				        """
				        figname = '%s_fig_%02i.%s' % (self.clean_name,
				                                      self.figures_counter, fmt)
				        self.figures_counter += 1
				        fullname = os.path.join(self.files_dir, figname)

				        # Binary files are base64-encoded, SVG is already XML
				        if fmt in ('png', 'jpg', 'pdf'):
				            # NOTE(review): the 'base64' str codec is Python 2 only.
				            data = data.decode('base64')
				            fopen = lambda fname: open(fname, 'wb')
				        else:
				            fopen = lambda fname: codecs.open(fname, 'wb',
				                                              self.default_encoding)

				        with fopen(fullname) as f:
				            f.write(data)

				        return fullname

				    def render_heading(self, cell):
				        """convert a heading cell

				        Returns list."""
				        raise NotImplementedError

				    def render_code(self, cell):
				        """Convert a code cell

				        Returns list."""
				        raise NotImplementedError

				    def render_markdown(self, cell):
				        """convert a markdown cell

				        Returns list."""
				        raise NotImplementedError

				    def _img_lines(self, img_file):
				        """Return list of lines to include an image file."""
				        # Note: subclasses may choose to implement format-specific _FMT_lines
				        # methods if they so choose (FMT in {png, svg, jpg, pdf}).
				        raise NotImplementedError

				    def render_display_data(self, output):
				        """convert display data from the output of a code cell

				        The format is the first entry of ``display_data_priority`` found
				        in ``output``; failing that, the first key of ``output`` other
				        than ``'output_type'``.

				        Returns list.

				        Raises RuntimeError if ``output`` holds no display data at all.
				        """
				        for fmt in self.display_data_priority:
				            if fmt in output:
				                break
				        else:
				            # None of the preferred formats is present: take any other key.
				            for fmt in output:
				                if fmt != 'output_type':
				                    break
				            else:
				                raise RuntimeError('no display data')

				        # Is it an image?
				        if fmt in ['png', 'svg', 'jpg', 'pdf'] and self.extract_figures:
				            print('I will extract this', fmt)
				            img_file = self._new_figure(output[fmt], fmt)
				            # Subclasses can have format-specific render functions (e.g.,
				            # latex has to auto-convert all SVG to PDF first).
				            lines_fun = getattr(self, '_%s_lines' % fmt, None)
				            if not lines_fun:
				                lines_fun = self._img_lines
				            lines = lines_fun(img_file)
				        else:
				            print('I will NOT extract this', fmt)
				            lines_fun = self.dispatch_display_format(fmt)
				            lines = lines_fun(output)

				        return lines

				    def render_raw(self, cell):
				        """convert a cell with raw text

				        Returns list."""
				        raise NotImplementedError

				    def render_unknown(self, cell):
				        """Render cells of unknown type

				        Logs a warning and passes the pretty-printed cell to
				        ``_unknown_lines``.

				        Returns list."""
				        data = pprint.pformat(cell)
				        logging.warning('Unknown cell: %s', cell.cell_type)
				        return self._unknown_lines(data)

				    def render_unknown_display(self, output, type=None):
				        """Render display output of unknown format

				        ``type`` is unused; it is optional so that this method can be
				        called with the output alone, as ``render_display_data`` does
				        with the function returned by ``dispatch_display_format``.

				        Returns list."""
				        data = pprint.pformat(output)
				        logging.warning('Unknown output: %s', output.output_type)
				        return self._unknown_lines(data)

				    def render_stream(self, output):
				        """render the stream part of an output

				        Returns list.

				        Identical to render_display_format_text
				        """
				        return self.render_display_format_text(output)

				    def render_pyout(self, output):
				        """convert pyout part of a code cell

				        Returns list."""
				        raise NotImplementedError

				    def render_pyerr(self, output):
				        """convert pyerr part of a code cell

				        Returns list."""
				        raise NotImplementedError

				    def _unknown_lines(self, data):
				        """Return list of lines for an unknown cell.

				        Parameters
				        ----------
				        data : str
				          The content of the unknown data as a single string.
				        """
				        raise NotImplementedError

				    # These are the possible format types in an output node

				    def render_display_format_text(self, output):
				        """render the text part of an output

				        Returns list.
				        """
				        raise NotImplementedError

				    def render_display_format_html(self, output):
				        """render the html part of an output

				        Returns list.
				        """
				        raise NotImplementedError

				    def render_display_format_latex(self, output):
				        """render the latex part of an output

				        Returns list.
				        """
				        raise NotImplementedError

				    def render_display_format_json(self, output):
				        """render the json part of an output

				        Returns list.
				        """
				        raise NotImplementedError

				    def render_display_format_javascript(self, output):
				        """render the javascript part of an output

				        Returns list.
				        """
				        raise NotImplementedError