upstream/ipython Commit - r8018:e0803aa9

Merge pull request from minrk/limitdictdb...

Min RK -

r8018:e0803aa9

parent child

IPython/parallel/controller/dictdb.py

0 +93 -7

             """A Task logger that presents our DB interface,
             but exists entirely in memory and implemented with dicts.
             Authors:
             * Min RK
             TaskRecords are dicts of the form:
             {
                 'msg_id' : str(uuid),
                 'client_uuid' : str(uuid),
                 'engine_uuid' : str(uuid) or None,
                 'header' : dict(header),
                 'content': dict(content),
                 'buffers': list(buffers),
                 'submitted': datetime,
                 'started': datetime or None,
                 'completed': datetime or None,
                 'resubmitted': datetime or None,
                 'result_header' : dict(header) or None,
                 'result_content' : dict(content) or None,
                 'result_buffers' : list(buffers) or None,
             }
             With this info, many of the special categories of tasks can be defined by query:
             pending:  completed is None
             client's outstanding: client_uuid = uuid && completed is None
             MIA: arrived is None (and completed is None)
             etc.
             EngineRecords are dicts of the form:
             {
                 'eid' : int(id),
                 'uuid': str(uuid)
             }
             This may be extended, but currently contains only these two keys.
             We support a subset of mongodb operators:
                 $lt,$gt,$lte,$gte,$ne,$in,$nin,$all,$mod,$exists
             """
             #-----------------------------------------------------------------------------
             #  Copyright (C) 2010-2011  The IPython Development Team
             #
             #  Distributed under the terms of the BSD License.  The full license is in
             #  the file COPYING, distributed as part of this software.
             #-----------------------------------------------------------------------------
             from copy import deepcopy as copy
             from datetime import datetime
             from IPython.config.configurable import LoggingConfigurable
-            from IPython.utils.traitlets import Dict, Unicode, Instance
+            from IPython.utils.traitlets import Dict, Unicode, Integer, Float
             # Comparison operators for mongodb-style queries.
             # Each callable takes ``a`` (the value found in the record) and ``b``
             # (the operand supplied in the query) and returns whether they match.
             filters = {
                 '$lt' : lambda a,b: a < b,
                 # was `b > a`, which is `$lt` again; `$gt` must test record > operand
                 '$gt' : lambda a,b: a > b,
                 '$eq' : lambda a,b: a == b,
                 '$ne' : lambda a,b: a != b,
                 '$lte': lambda a,b: a <= b,
                 '$gte': lambda a,b: a >= b,
                 '$in' : lambda a,b: a in b,
                 '$nin': lambda a,b: a not in b,
                 # NOTE(review): mongodb's $all matches when the stored array contains
                 # every element of the operand (all(x in a for x in b)); this tests the
                 # opposite containment -- confirm which one callers rely on.
                 '$all': lambda a,b: all([ a in bb for bb in b ]),
                 # operand is a (divisor, remainder) pair
                 '$mod': lambda a,b: a%b[0] == b[1],
                 # operand is a flag: true -> value must be set, false -> value must be None
                 '$exists' : lambda a,b: (b and a is not None) or (a is None and not b)
             }
             class CompositeFilter(object):
                 """Conjunction of several operator tests applied to a single value.
                 Built from a dict mapping operator names (keys of `filters`) to their
                 operands; calling the instance with a value returns True only when
                 every operator accepts that value.
                 """
                 def __init__(self, dikt):
                     pairs = list(dikt.iteritems())
                     # parallel lists: tests[i] is evaluated against values[i]
                     self.tests = [filters[op] for op, _ in pairs]
                     self.values = [operand for _, operand in pairs]
                 def __call__(self, value):
                     return all(
                         test(value, check)
                         for test, check in zip(self.tests, self.values)
                     )
             class BaseDB(LoggingConfigurable):
                 """Empty Parent class so traitlets work on DB.
                 The backends defined in this module derive from it.
                 """
                 # base configurable traits:
                 # unicode trait, empty string by default
                 session = Unicode("")
             class DictDB(BaseDB):
                 """Basic in-memory dict-based object for saving Task Records.
                 This is the first object to present the DB interface
                 for logging tasks out of memory.
                 The interface is based on MongoDB, so adding a MongoDB
                 backend should be straightforward.
                 """
                 _records = Dict()
+                _culled_ids = set() # set of ids which have been culled
+                _buffer_bytes = Integer(0) # running total of the bytes in the DB
+                size_limit = Integer(1024*1024, config=True,
+                    help="""The maximum total size (in bytes) of the buffers stored in the db
+                    When the db exceeds this size, the oldest records will be culled until
+                    the total size is under size_limit * (1-cull_fraction).
+                    """
+                )
+                record_limit = Integer(1024, config=True,
+                    help="""The maximum number of records in the db
+                    When the history exceeds this size, the first record_limit * cull_fraction
+                    records will be culled.
+                    """
+                )
+                cull_fraction = Float(0.1, config=True,
+                    help="""The fraction by which the db should culled when one of the limits is exceeded
+                    In general, the db size will spend most of its time with a size in the range:
+                    [limit * (1-cull_fraction), limit]
+                    for each of size_limit and record_limit.
+                    """
+                )
                 def _match_one(self, rec, tests):
                     """Check if a specific record matches tests."""
                     for key,test in tests.iteritems():
                         if not test(rec.get(key, None)):
                             return False
                     return True
                 def _match(self, check):
                     """Find all the matches for a check dict."""
                     matches = []
                     tests = {}
                     for k,v in check.iteritems():
                         if isinstance(v, dict):
                             tests[k] = CompositeFilter(v)
                         else:
                             tests[k] = lambda o: o==v
                     for rec in self._records.itervalues():
                         if self._match_one(rec, tests):
                             matches.append(copy(rec))
                     return matches
                 def _extract_subdict(self, rec, keys):
                     """extract subdict of keys"""
                     d = {}
                     d['msg_id'] = rec['msg_id']
                     for key in keys:
                         d[key] = rec[key]
                     return copy(d)
+                # methods for monitoring size / culling history
+                def _add_bytes(self, rec):
+                    for key in ('buffers', 'result_buffers'):
+                        for buf in rec.get(key) or []:
+                            self._buffer_bytes += len(buf)
+                    self._maybe_cull()
+                def _drop_bytes(self, rec):
+                    for key in ('buffers', 'result_buffers'):
+                        for buf in rec.get(key) or []:
+                            self._buffer_bytes -= len(buf)
+                def _cull_oldest(self, n=1):
+                    """cull the oldest N records"""
+                    for msg_id in self.get_history()[:n]:
+                        self.log.debug("Culling record: %r", msg_id)
+                        self._culled_ids.add(msg_id)
+                        self.drop_record(msg_id)
+                def _maybe_cull(self):
+                    # cull by count:
+                    if len(self._records) > self.record_limit:
+                        to_cull = int(self.cull_fraction * self.record_limit)
+                        self.log.info("%i records exceeds limit of %i, culling oldest %i",
+                            len(self._records), self.record_limit, to_cull
+                        )
+                        self._cull_oldest(to_cull)
+                    # cull by size:
+                    if self._buffer_bytes > self.size_limit:
+                        limit = self.size_limit * (1 - self.cull_fraction)
+                        before = self._buffer_bytes
+                        before_count = len(self._records)
+                        culled = 0
+                        while self._buffer_bytes > limit:
+                            self._cull_oldest(1)
+                            culled += 1
+                        self.log.info("%i records with total buffer size %i exceeds limit: %i. Culled oldest %i records.",
+                            before_count, before, self.size_limit, culled
+                        )
+                # public API methods:
                 def add_record(self, msg_id, rec):
                     """Add a new Task Record, by msg_id."""
                     if msg_id in self._records:
                         raise KeyError("Already have msg_id %r"%(msg_id))
                     self._records[msg_id] = rec
+                    self._add_bytes(rec)
+                    self._maybe_cull()
                 def get_record(self, msg_id):
                     """Get a specific Task Record, by msg_id."""
+                    if msg_id in self._culled_ids:
+                        raise KeyError("Record %r has been culled for size" % msg_id)
                     if not msg_id in self._records:
                         raise KeyError("No such msg_id %r"%(msg_id))
                     return copy(self._records[msg_id])
                 def update_record(self, msg_id, rec):
                     """Update the data in an existing record."""
-                    self._records[msg_id].update(rec)
+                    if msg_id in self._culled_ids:
+                        raise KeyError("Record %r has been culled for size" % msg_id)
+                    _rec = self._records[msg_id]
+                    self._drop_bytes(_rec)
+                    _rec.update(rec)
+                    self._add_bytes(_rec)
                 def drop_matching_records(self, check):
                     """Remove a record from the DB."""
                     matches = self._match(check)
-                    for m in matches:
+                    for rec in matches:
-                        del self._records[m['msg_id']]
+                        self._drop_bytes(rec)
+                        del self._records[rec['msg_id']]
                 def drop_record(self, msg_id):
                     """Remove a record from the DB."""
+                    rec = self._records[msg_id]
+                    self._drop_bytes(rec)
                     del self._records[msg_id]
                 def find_records(self, check, keys=None):
                     """Find records matching a query dict, optionally extracting subset of keys.
                     Returns a list of the matching records; the entries are copies, so
                     modifying them does not affect the stored records.
                     Parameters
                     ----------
                     check: dict
                         mongodb-style query argument
                     keys: list of strs [optional]
                         if specified, the subset of keys to extract.  msg_id will *always* be
                         included.
                     """
                     matches = self._match(check)
                     if keys:
                         return [ self._extract_subdict(rec, keys) for rec in matches ]
                     else:
                         return matches
                 def get_history(self):
                     """Return every stored msg_id, sorted by its record's 'submitted' value."""
                     def submitted_at(msg_id):
                         return self._records[msg_id]['submitted']
                     return sorted(self._records.keys(), key=submitted_at)
             NODATA = KeyError("NoDB backend doesn't store any data. "
             "Start the Controller with a DB backend to enable resubmission / result persistence."
             )
-            class NoDB(DictDB):
+            class NoDB(BaseDB):
                 """A blackhole db backend that actually stores no information.
                 Provides the full DB interface, but raises KeyErrors on any
                 method that tries to access the records.  This can be used to
                 minimize the memory footprint of the Hub when its record-keeping
                 functionality is not required.
                 """
                 def add_record(self, msg_id, record):
                     pass
                 def get_record(self, msg_id):
                     raise NODATA
                 def update_record(self, msg_id, record):
                     pass
                 def drop_matching_records(self, check):
                     pass
                 def drop_record(self, msg_id):
                     pass
                 def find_records(self, check, keys=None):
                     raise NODATA
                 def get_history(self):
                     raise NODATA

IPython/parallel/tests/test_db.py

0 +70 -5

             """Tests for db backends
             Authors:
             * Min RK
             """
             #-------------------------------------------------------------------------------
             #  Copyright (C) 2011  The IPython Development Team
             #
             #  Distributed under the terms of the BSD License.  The full license is in
             #  the file COPYING, distributed as part of this software.
             #-------------------------------------------------------------------------------
             #-------------------------------------------------------------------------------
             # Imports
             #-------------------------------------------------------------------------------
             from __future__ import division
             import logging
             import os
             import tempfile
             import time
             from datetime import datetime, timedelta
             from unittest import TestCase
             from IPython.parallel import error
             from IPython.parallel.controller.dictdb import DictDB
             from IPython.parallel.controller.sqlitedb import SQLiteDB
             from IPython.parallel.controller.hub import init_record, empty_record
             from IPython.testing import decorators as dec
             from IPython.zmq.session import Session
             #-------------------------------------------------------------------------------
             # TestCases
             #-------------------------------------------------------------------------------
             def setup():
                 """Choose the path of the db file used by the sqlite tests in this module."""
                 global temp_db
                 # only the generated path is kept; the NamedTemporaryFile object itself
                 # is discarded right away
                 temp_db = tempfile.NamedTemporaryFile(suffix='.db').name
-            class TestDictBackend(TestCase):
+            class TaskDBTest:
                 def setUp(self):
                     self.session = Session()
                     self.db = self.create_db()
                     self.load_records(16)
                 def create_db(self):
-                    return DictDB()
+                    raise NotImplementedError
-                def load_records(self, n=1):
+                def load_records(self, n=1, buffer_size=100):
                     """load n records for testing"""
                     #sleep 1/10 s, to ensure timestamp is different to previous calls
                     time.sleep(0.1)
                     msg_ids = []
                     for i in range(n):
                         msg = self.session.msg('apply_request', content=dict(a=5))
-                        msg['buffers'] = []
+                        msg['buffers'] = [os.urandom(buffer_size)]
                         rec = init_record(msg)
                         msg_id = msg['header']['msg_id']
                         msg_ids.append(msg_id)
                         self.db.add_record(msg_id, rec)
                     return msg_ids
                 def test_add_record(self):
                     before = self.db.get_history()
                     self.load_records(5)
                     after = self.db.get_history()
                     self.assertEqual(len(after), len(before)+5)
                     self.assertEqual(after[:-5],before)
                 def test_drop_record(self):
                     msg_id = self.load_records()[-1]
                     rec = self.db.get_record(msg_id)
                     self.db.drop_record(msg_id)
                     self.assertRaises(KeyError,self.db.get_record, msg_id)
                 def _round_to_millisecond(self, dt):
                     """necessary because mongodb rounds microseconds"""
                     micro = dt.microsecond
                     extra = int(str(micro)[-3:])
                     return dt - timedelta(microseconds=extra)
                 def test_update_record(self):
                     now = self._round_to_millisecond(datetime.now())
                     #
                     msg_id = self.db.get_history()[-1]
                     rec1 = self.db.get_record(msg_id)
                     data = {'stdout': 'hello there', 'completed' : now}
                     self.db.update_record(msg_id, data)
                     rec2 = self.db.get_record(msg_id)
                     self.assertEqual(rec2['stdout'], 'hello there')
                     self.assertEqual(rec2['completed'], now)
                     rec1.update(data)
                     self.assertEqual(rec1, rec2)
                 # def test_update_record_bad(self):
                 #     """test updating nonexistant records"""
                 #     msg_id = str(uuid.uuid4())
                 #     data = {'stdout': 'hello there'}
                 #     self.assertRaises(KeyError, self.db.update_record, msg_id, data)
                 def test_find_records_dt(self):
                     """test finding records by date"""
                     hist = self.db.get_history()
                     middle = self.db.get_record(hist[len(hist)//2])
                     tic = middle['submitted']
                     before = self.db.find_records({'submitted' : {'$lt' : tic}})
                     after = self.db.find_records({'submitted' : {'$gte' : tic}})
                     self.assertEqual(len(before)+len(after),len(hist))
                     for b in before:
                         self.assertTrue(b['submitted'] < tic)
                     for a in after:
                         self.assertTrue(a['submitted'] >= tic)
                     same = self.db.find_records({'submitted' : tic})
                     for s in same:
                         self.assertTrue(s['submitted'] == tic)
                 def test_find_records_keys(self):
                     """test extracting subset of record keys"""
                     found = self.db.find_records({'msg_id': {'$ne' : ''}},keys=['submitted', 'completed'])
                     for rec in found:
                         self.assertEqual(set(rec.keys()), set(['msg_id', 'submitted', 'completed']))
                 def test_find_records_msg_id(self):
                     """ensure msg_id is always in found records"""
                     found = self.db.find_records({'msg_id': {'$ne' : ''}},keys=['submitted', 'completed'])
                     for rec in found:
                         self.assertTrue('msg_id' in rec.keys())
                     found = self.db.find_records({'msg_id': {'$ne' : ''}},keys=['submitted'])
                     for rec in found:
                         self.assertTrue('msg_id' in rec.keys())
                     found = self.db.find_records({'msg_id': {'$ne' : ''}},keys=['msg_id'])
                     for rec in found:
                         self.assertTrue('msg_id' in rec.keys())
                 def test_find_records_in(self):
                     """test finding records with '$in','$nin' operators"""
                     hist = self.db.get_history()
                     even = hist[::2]
                     odd = hist[1::2]
                     recs = self.db.find_records({ 'msg_id' : {'$in' : even}})
                     found = [ r['msg_id'] for r in recs ]
                     self.assertEqual(set(even), set(found))
                     recs = self.db.find_records({ 'msg_id' : {'$nin' : even}})
                     found = [ r['msg_id'] for r in recs ]
                     self.assertEqual(set(odd), set(found))
                 def test_get_history(self):
                     msg_ids = self.db.get_history()
                     latest = datetime(1984,1,1)
                     for msg_id in msg_ids:
                         rec = self.db.get_record(msg_id)
                         newt = rec['submitted']
                         self.assertTrue(newt >= latest)
                         latest = newt
                     msg_id = self.load_records(1)[-1]
                     self.assertEqual(self.db.get_history()[-1],msg_id)
                 def test_datetime(self):
                     """get/set timestamps with datetime objects"""
                     msg_id = self.db.get_history()[-1]
                     rec = self.db.get_record(msg_id)
                     self.assertTrue(isinstance(rec['submitted'], datetime))
                     self.db.update_record(msg_id, dict(completed=datetime.now()))
                     rec = self.db.get_record(msg_id)
                     self.assertTrue(isinstance(rec['completed'], datetime))
                 def test_drop_matching(self):
                     msg_ids = self.load_records(10)
                     query = {'msg_id' : {'$in':msg_ids}}
                     self.db.drop_matching_records(query)
                     recs = self.db.find_records(query)
                     self.assertEqual(len(recs), 0)
                 def test_null(self):
                     """test None comparison queries"""
                     msg_ids = self.load_records(10)
                     query = {'msg_id' : None}
                     recs = self.db.find_records(query)
                     self.assertEqual(len(recs), 0)
                     query = {'msg_id' : {'$ne' : None}}
                     recs = self.db.find_records(query)
                     self.assertTrue(len(recs) >= 10)
                 def test_pop_safe_get(self):
                     """editing query results shouldn't affect record [get]"""
                     msg_id = self.db.get_history()[-1]
                     rec = self.db.get_record(msg_id)
                     rec.pop('buffers')
                     rec['garbage'] = 'hello'
                     rec['header']['msg_id'] = 'fubar'
                     rec2 = self.db.get_record(msg_id)
                     self.assertTrue('buffers' in rec2)
                     self.assertFalse('garbage' in rec2)
                     self.assertEqual(rec2['header']['msg_id'], msg_id)
                 def test_pop_safe_find(self):
                     """editing query results shouldn't affect record [find]"""
                     msg_id = self.db.get_history()[-1]
                     rec = self.db.find_records({'msg_id' : msg_id})[0]
                     rec.pop('buffers')
                     rec['garbage'] = 'hello'
                     rec['header']['msg_id'] = 'fubar'
                     rec2 = self.db.find_records({'msg_id' : msg_id})[0]
                     self.assertTrue('buffers' in rec2)
                     self.assertFalse('garbage' in rec2)
                     self.assertEqual(rec2['header']['msg_id'], msg_id)
                 def test_pop_safe_find_keys(self):
                     """editing query results shouldn't affect record [find+keys]"""
                     msg_id = self.db.get_history()[-1]
                     rec = self.db.find_records({'msg_id' : msg_id}, keys=['buffers', 'header'])[0]
                     rec.pop('buffers')
                     rec['garbage'] = 'hello'
                     rec['header']['msg_id'] = 'fubar'
                     rec2 = self.db.find_records({'msg_id' : msg_id})[0]
                     self.assertTrue('buffers' in rec2)
                     self.assertFalse('garbage' in rec2)
                     self.assertEqual(rec2['header']['msg_id'], msg_id)
-            class TestSQLiteBackend(TestDictBackend):
+            class TestDictBackend(TaskDBTest, TestCase):
+                def create_db(self):
+                    return DictDB()
+                def test_cull_count(self):
+                    self.db = self.create_db() # skip the load-records init from setUp
+                    self.db.record_limit = 20
+                    self.db.cull_fraction = 0.2
+                    self.load_records(20)
+                    self.assertEquals(len(self.db.get_history()), 20)
+                    self.load_records(1)
+                    # 0.2 * 20 = 4, 21 - 4 = 17
+                    self.assertEquals(len(self.db.get_history()), 17)
+                    self.load_records(3)
+                    self.assertEquals(len(self.db.get_history()), 20)
+                    self.load_records(1)
+                    self.assertEquals(len(self.db.get_history()), 17)
+                    for i in range(100):
+                        self.load_records(1)
+                        self.assertTrue(len(self.db.get_history()) >= 17)
+                        self.assertTrue(len(self.db.get_history()) <= 20)
+                def test_cull_size(self):
+                    self.db = self.create_db() # skip the load-records init from setUp
+                    self.db.size_limit = 1000
+                    self.db.cull_fraction = 0.2
+                    self.load_records(100, buffer_size=10)
+                    self.assertEquals(len(self.db.get_history()), 100)
+                    self.load_records(1, buffer_size=0)
+                    self.assertEquals(len(self.db.get_history()), 101)
+                    self.load_records(1, buffer_size=1)
+                    # 0.2 * 100 = 20, 101 - 20 = 81
+                    self.assertEquals(len(self.db.get_history()), 81)
+                def test_cull_size_drop(self):
+                    """dropping records updates tracked buffer size"""
+                    self.db = self.create_db() # skip the load-records init from setUp
+                    self.db.size_limit = 1000
+                    self.db.cull_fraction = 0.2
+                    self.load_records(100, buffer_size=10)
+                    self.assertEquals(len(self.db.get_history()), 100)
+                    self.db.drop_record(self.db.get_history()[-1])
+                    self.assertEquals(len(self.db.get_history()), 99)
+                    self.load_records(1, buffer_size=5)
+                    self.assertEquals(len(self.db.get_history()), 100)
+                    self.load_records(1, buffer_size=5)
+                    self.assertEquals(len(self.db.get_history()), 101)
+                    self.load_records(1, buffer_size=1)
+                    self.assertEquals(len(self.db.get_history()), 81)
+                def test_cull_size_update(self):
+                    """updating records updates tracked buffer size"""
+                    self.db = self.create_db() # skip the load-records init from setUp
+                    self.db.size_limit = 1000
+                    self.db.cull_fraction = 0.2
+                    self.load_records(100, buffer_size=10)
+                    self.assertEquals(len(self.db.get_history()), 100)
+                    msg_id = self.db.get_history()[-1]
+                    self.db.update_record(msg_id, dict(result_buffers = [os.urandom(10)], buffers=[]))
+                    self.assertEquals(len(self.db.get_history()), 100)
+                    self.db.update_record(msg_id, dict(result_buffers = [os.urandom(11)], buffers=[]))
+                    self.assertEquals(len(self.db.get_history()), 79)
+            class TestSQLiteBackend(TaskDBTest, TestCase):
                 @dec.skip_without('sqlite3')
                 def create_db(self):
                     location, fname = os.path.split(temp_db)
                     log = logging.getLogger('test')
                     log.setLevel(logging.CRITICAL)
                     return SQLiteDB(location=location, fname=fname, log=log)
                 def tearDown(self):
                     self.db._db.close()
             def teardown():
                 """cleanup task db file after all tests have run
                 Removal is best-effort: a file that is missing or cannot be removed
                 is ignored.
                 """
                 try:
                     os.remove(temp_db)
                 except (OSError, NameError):
                     # OSError: the file is already gone or cannot be deleted
                     # NameError: setup() never ran, so temp_db was never defined
                     pass

IPython/parallel/tests/test_mongodb.py

0 +3 -1

             """Tests for mongodb backend
             Authors:
             * Min RK
             """
             #-------------------------------------------------------------------------------
             #  Copyright (C) 2011  The IPython Development Team
             #
             #  Distributed under the terms of the BSD License.  The full license is in
             #  the file COPYING, distributed as part of this software.
             #-------------------------------------------------------------------------------
             #-------------------------------------------------------------------------------
             # Imports
             #-------------------------------------------------------------------------------
+            from unittest import TestCase
             from nose import SkipTest
             from pymongo import Connection
             from IPython.parallel.controller.mongodb import MongoDB
             from . import test_db
             try:
                 c = Connection()
             except Exception:
                 c=None
-            class TestMongoBackend(test_db.TestDictBackend):
+            class TestMongoBackend(test_db.TaskDBTest, TestCase):
                 """MongoDB backend tests"""
                 def create_db(self):
                     try:
                         return MongoDB(database='iptestdb', _connection=c)
                     except Exception:
                         raise SkipTest("Couldn't connect to mongodb")
             def teardown(self):
                 if c is not None:
                     c.drop_database('iptestdb')

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages