From ffb0ed8398db3b6dbe98f0a57b9713ad5b51c194 2012-01-18 02:27:56
From: MinRK <benjaminrk@gmail.com>
Date: 2012-01-18 02:27:56
Subject: [PATCH] add NoDB for non-recording Hub

When used, this disables database-based actions in the Hub.

Useful for minimizing Hub memory consumption.

---

diff --git a/IPython/parallel/apps/ipcontrollerapp.py b/IPython/parallel/apps/ipcontrollerapp.py
index f9fba94..74e67ae 100755
--- a/IPython/parallel/apps/ipcontrollerapp.py
+++ b/IPython/parallel/apps/ipcontrollerapp.py
@@ -106,6 +106,13 @@ flags.update({
                     'use the MongoDB backend'),
     'dictdb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.DictDB'}},
                     'use the in-memory DictDB backend'),
+    'nodb' : ({'HubFactory' : {'db_class' : 'IPython.parallel.controller.dictdb.NoDB'}},
+                    """use dummy DB backend, which doesn't store any information.
+                    
+                    This can be used to prevent growth of the memory footprint of the Hub
+                    in cases where its record-keeping is not required.  Requesting results
+                    of tasks submitted by other clients, db_queries, and task resubmission
+                    will not be available."""),
     'reuse' : ({'IPControllerApp' : {'reuse_files' : True}},
                     'reuse existing json connection files')
 })
diff --git a/IPython/parallel/controller/dictdb.py b/IPython/parallel/controller/dictdb.py
index 21e44c5..10ce73e 100644
--- a/IPython/parallel/controller/dictdb.py
+++ b/IPython/parallel/controller/dictdb.py
@@ -183,3 +183,34 @@ class DictDB(BaseDB):
         """get all msg_ids, ordered by time submitted."""
         msg_ids = self._records.keys()
         return sorted(msg_ids, key=lambda m: self._records[m]['submitted'])
+
+class NoDB(DictDB):
+    """A blackhole db backend that actually stores no information.
+    
+    Provides the full DB interface, but raises KeyErrors on any
+    method that tries to access the records.  This can be used to
+    minimize the memory footprint of the Hub when its record-keeping
+    functionality is not required.
+    """
+    
+    def add_record(self, msg_id, record):
+        pass
+    
+    def get_record(self, msg_id):
+        raise KeyError("NoDB does not support record access")
+    
+    def update_record(self, msg_id, record):
+        pass
+    
+    def drop_matching_records(self, check):
+        pass
+    
+    def drop_record(self, msg_id):
+        pass
+    
+    def find_records(self, check, keys=None):
+        raise KeyError("NoDB does not store information")
+    
+    def get_history(self):
+        raise KeyError("NoDB does not store information")
+
diff --git a/docs/source/parallel/parallel_db.txt b/docs/source/parallel/parallel_db.txt
index f3dea61..648223f 100644
--- a/docs/source/parallel/parallel_db.txt
+++ b/docs/source/parallel/parallel_db.txt
@@ -112,3 +112,26 @@ Result headers for all jobs on engine 3 or 4:
     In [1]: uuids = map(rc._engines.get, (3,4))
 
     In [2]: hist34 = rc.db_query({'engine_uuid' : {'$in' : uuids }, keys='result_header')
+
+
+Cost
+====
+
+The advantage of the database backends is, of course, that large amounts of
+data can be stored that won't fit in memory.  The default 'backend' is actually
+to just store all of this information in a Python dictionary.  This is very fast,
+but will run out of memory quickly if you move a lot of data around, or your
+cluster runs for a long time.
+
+Unfortunately, the DB backends (SQLite and MongoDB) right now are rather slow,
+and can still consume large amounts of resources, particularly if large tasks
+or results are being created at a high frequency.
+
+For this reason, we have added :class:`~.NoDB`, a dummy backend that doesn't
+actually store any information. When you use this database, nothing is stored,
+and any request for results will result in a KeyError.  This obviously prevents
+later requests for results and task resubmission from functioning, but
+sometimes those nice features are not as useful as keeping Hub memory under
+control.
+
+
diff --git a/docs/source/parallel/parallel_process.txt b/docs/source/parallel/parallel_process.txt
index 33068b6..d8c34c6 100644
--- a/docs/source/parallel/parallel_process.txt
+++ b/docs/source/parallel/parallel_process.txt
@@ -762,6 +762,10 @@ To use one of these backends, you must set the :attr:`HubFactory.db_class` trait
     
     # and SQLite:
     c.HubFactory.db_class = 'IPython.parallel.controller.sqlitedb.SQLiteDB'
+    
+    # You can use NoDB to disable the database altogether, in case you don't need
+    # to reuse tasks or results, and want to keep memory consumption under control.
+    c.HubFactory.db_class = 'IPython.parallel.controller.dictdb.NoDB'
 
 When using the proper databases, you can actually allow for tasks to persist from
 one session to the next by specifying the MongoDB database or SQLite table in
@@ -789,6 +793,22 @@ you can specify any arguments you may need to the PyMongo `Connection
     # keyword args to pymongo.Connection
     c.MongoDB.connection_kwargs = {}
 
+But sometimes you are moving lots of data around quickly, and you don't need
+that information to be stored for later access, even by other Clients connected to this
+same session. For this case, we have a dummy database, which doesn't actually
+store anything. This lets the Hub stay small in memory, at the obvious expense
+of being able to access the information that would have been stored in the
+database (used for task resubmission, requesting results of tasks you didn't
+submit, etc.). To use this backend, simply pass ``--nodb`` to
+:command:`ipcontroller` on the command-line, or specify the :class:`NoDB` class
+in your :file:`ipcontroller_config.py` as described above.
+
+
+.. seealso::
+
+    For more information on the database backends, see the :ref:`db backend reference <parallel_db>`.
+
+
 .. _PyMongo: http://api.mongodb.org/python/1.9/
 
 Configuring `ipengine`